def fill_from_articles_labels(self, filepath, labels=None, verbose=False):
    """Fill this object's word dictionaries from a corpus file, optionally
    restricted to words carrying specific labels (e.g. only "PER" words).

    The corpus file is expected to contain one article/document per line.
    Words in the file may be annotated with labels, e.g.
    "John/PER Doe/PER did yesterday...". Count values and ranks are
    estimated automatically from the word frequencies.

    Note: this function is rather slow.

    Args:
        filepath: Filepath to the corpus file.
        labels: Optional list of labels. If provided, only words annotated
            with any of these labels will be counted.
        verbose: Whether to output progress messages during parsing.
    """
    assert labels is None or isinstance(labels, list)
    assert labels is None or len(labels) > 0

    word_counter = Counter()
    self.sum_of_counts = 0

    # Count every matching word over all articles in the corpus.
    for article_idx, article in enumerate(load_articles(filepath, start_at=0)):
        matching_words = [token.word for token in article.tokens
                          if labels is None or token.label in labels]
        word_counter.update(matching_words)
        if verbose and article_idx % 1000 == 0:
            print("Article %d" % (article_idx))

    # Ranks are 1-based, assigned in descending order of frequency.
    for rank_zero_based, (word, count) in enumerate(word_counter.most_common()):
        self.word_to_count[word] = count
        self.word_to_rank[word] = rank_zero_based + 1
        self.sum_of_counts += count
def train(args):
    """Main training method. Does the following:
        1. Creates a new pycrfsuite trainer object, to which feature chains
           and label chains are added and which is then trained on them.
        2. Creates the feature generators. A feature generator might e.g.
           take in a window of N tokens and return ["upper=1"] for each
           token starting with an uppercase letter and ["upper=0"]
           otherwise. (Lists, because one generator may produce several
           features per token, e.g. LDA where a token can belong to
           multiple topics.)
        3. Loads windows from the corpus. Each window has a fixed
           (maximum) size in tokens. Only windows containing at least one
           label (named entity) are loaded, so no time is wasted on
           windows without any label.
        4. Generates features for each chain of tokens (window), as
           described in (2.). Each token chain becomes a list of lists:
           one top-level list per token, containing that token's feature
           values, e.g.
           [["w2v=123", "bc=742", "upper=0"],
            ["w2v=4", "bc=12", "upper=1", "lda4=1"]] for two tokens.
        5. Adds feature chains and label chains to the trainer.
        6. Trains. This may take several hours for 20k windows.

    Args:
        args: Command line arguments as parsed by argparse.ArgumentParser.
    """
    trainer = pycrfsuite.Trainer(verbose=True)

    # Create/Initialize the feature generators; this may take a few minutes.
    print("Creating features...")
    feature_generators = features.create_features()

    # Initialize the window generator; each window has a fixed maximum
    # size in tokens.
    print("Loading windows...")
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH),
                           cfg.WINDOW_SIZE, feature_generators,
                           only_labeled_windows=True)

    # Add chains of features (lists of lists of strings) and chains of
    # labels (lists of strings) to the trainer. This may take a long
    # while, especially because of the lengthy POS tagging. POS tags and
    # LDA results are cached, so a second run through this part will be
    # significantly faster.
    print("Adding example windows (up to max %d)..." % (cfg.COUNT_WINDOWS_TRAIN))
    examples = generate_examples(windows, nb_append=cfg.COUNT_WINDOWS_TRAIN,
                                 nb_skip=cfg.COUNT_WINDOWS_TEST, verbose=True)
    for feature_chain, label_chain in examples:
        trainer.append(feature_chain, label_chain)

    # Train the model; this may take several hours.
    print("Training...")
    if cfg.MAX_ITERATIONS is not None and cfg.MAX_ITERATIONS > 0:
        # Cap the number of optimizer iterations if the config defines a
        # limit; otherwise the optimizer stops automatically on its own.
        trainer.set_params({'max_iterations': cfg.MAX_ITERATIONS})
    trainer.train(args.identifier)
def test_on_mycorpus(args):
    """Evaluate the model on the corpus set in ARTICLES_FILEPATH.

    Prints a full report, including precision, recall and F1 score per
    label.

    Args:
        args: Command line arguments as parsed by argparse.ArgumentParser.
    """
    print("Testing on mycorpus (%s)..." % (cfg.ARTICLES_FILEPATH))
    articles = load_articles(cfg.ARTICLES_FILEPATH)
    test_on_articles(args.identifier, articles,
                     nb_append=cfg.COUNT_WINDOWS_TEST)
def train(args):
    """Debug variant of the main training method: instead of training a
    CRF model, it generates the feature/label chains and dumps them to a
    text file for manual inspection.

    Steps performed:
        1. Creates the feature generators. A feature generator might e.g.
           take in a window of N tokens and return ["upper=1"] for each
           token starting with an uppercase letter and ["upper=0"]
           otherwise. (Lists, because one generator may produce several
           features per token, e.g. LDA where a token can belong to
           multiple topics.)
        2. Loads windows from the corpus. Each window has a fixed
           (maximum) size in tokens. Only windows containing at least one
           label (named entity) are loaded.
        3. Generates features for each chain of tokens (window). Each
           token chain becomes a list of lists, e.g.
           [["w2v=123", "bc=742", "upper=0"],
            ["w2v=4", "bc=12", "upper=1", "lda4=1"]] for two tokens.
        4. Writes each feature chain, a "------" separator and the label
           chain to the output file.

    Args:
        args: Command line arguments as parsed by argparse.ArgumentParser.
    """
    # Create/Initialize the feature generators; this may take a few minutes.
    print("Creating features...")
    feature_generators = features.create_features()

    # Initialize the window generator; each window has a fixed maximum
    # size in tokens.
    print("Loading windows...")
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH),
                           cfg.WINDOW_SIZE, feature_generators,
                           only_labeled_windows=True)

    # Generate the feature chains. This may take a long while, especially
    # because of the lengthy POS tagging. POS tags and LDA results are
    # cached, so a second run through this part will be significantly
    # faster.
    print("Adding example windows (up to max %d)..." % (cfg.COUNT_WINDOWS_TRAIN))
    examples = generate_examples(windows, nb_append=cfg.COUNT_WINDOWS_TRAIN,
                                 nb_skip=cfg.COUNT_WINDOWS_TEST, verbose=True)

    # Fix: use a context manager so the file handle is closed even if
    # generating an example raises; the original opened the file manually
    # and would leak the handle on error.
    # NOTE(review): output path is hard-coded — consider making it
    # configurable via cfg or args.
    with open('/home/nitin.jain/ner_aleju/dataset/outfeatures.txt', 'w') as f:
        for feature_values_lists, labels in examples:
            f.write(str(feature_values_lists + ["------"] + labels))
def train_lda():
    """Train the LDA model.

    generate_dictionary() must be called before this method.
    """
    print("------------------")
    print("Training LDA model")
    print("------------------")

    # Load the dictionary, as generated by generate_dictionary().
    print("Loading dictionary...")
    dictionary = gensim.corpora.dictionary.Dictionary.load(
        cfg.LDA_DICTIONARY_FILEPATH)

    # Build the reverse mapping from word id to word.
    print("Generating id2word...")
    id2word = {word_id: word for word, word_id in dictionary.token2id.items()}

    print("Initializing LDA...")
    lda_model = LdaMulticore(corpus=None, num_topics=cfg.LDA_COUNT_TOPICS,
                             id2word=id2word, workers=LDA_COUNT_WORKERS,
                             chunksize=LDA_CHUNK_SIZE)

    # Train the LDA model in batches of windows.
    print("Training...")
    update_every_n_windows = 25000
    pending_examples = []
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH),
                           cfg.LDA_WINDOW_SIZE, only_labeled_windows=True)
    for window_idx, window in enumerate(windows):
        lowered_words = [token.word.lower() for token in window.tokens]
        # Each window is represented as a bag of words.
        pending_examples.append(dictionary.doc2bow(lowered_words))
        if len(pending_examples) >= update_every_n_windows:
            print("Updating (at window %d of max %d)..."
                  % (window_idx, COUNT_EXAMPLES_FOR_LDA))
            # This is where the LDA model is actually trained.
            lda_model.update(pending_examples)
            pending_examples = []
        if window_idx >= COUNT_EXAMPLES_FOR_LDA:
            print("Reached max of %d windows." % (COUNT_EXAMPLES_FOR_LDA,))
            break

    # The remainder of windows is deliberately NOT used for a final
    # update, because each update step's results might be heavily
    # influenced/skewed by the number of examples it receives.
    #if len(pending_examples) > 0:
    #    print("Updating with remaining windows...")
    #    lda_model.update(pending_examples)

    # Save the trained model to disk.
    print("Saving...")
    lda_model.save(cfg.LDA_MODEL_FILEPATH)
def generate_dictionary():
    """Generate the dictionary/vocabulary used for the LDA.

    Streams articles from ARTICLES_FILEPATH, adds their words to a gensim
    Dictionary in batches, filters out rare words and saves the result to
    LDA_DICTIONARY_FILEPATH.
    """
    print("------------------")
    print("Generating LDA Dictionary")
    print("------------------")

    # We generate the dictionary from the same corpus that is also used
    # to find named entities.
    articles = load_articles(cfg.ARTICLES_FILEPATH)
    articles_str = []
    dictionary = gensim.corpora.Dictionary()
    update_every_n_articles = 1000

    # Add words to the dictionary in batches to keep memory usage low.
    for i, article in enumerate(articles):
        articles_str.append(article.get_content_as_string().lower().split(" "))
        if len(articles_str) >= update_every_n_articles:
            print("Updating (at article %d of max %d)..."
                  % (i, COUNT_EXAMPLES_FOR_DICTIONARY))
            dictionary.add_documents(articles_str)
            articles_str = []
        if i > COUNT_EXAMPLES_FOR_DICTIONARY:
            print("Reached max of %d articles." % (COUNT_EXAMPLES_FOR_DICTIONARY,))
            break

    # Flush the last, incomplete batch.
    if len(articles_str) > 0:
        print("Updating with remaining articles...")
        dictionary.add_documents(articles_str)

    print("Loaded %d unique words." % (len(dictionary.keys()),))

    # Filter some rare words to save space and computation time during
    # training.
    # Fix: dict.iteritems() is Python-2-only and raises AttributeError on
    # Python 3; dict.items() behaves correctly on both versions.
    print("Filtering rare words...")
    rare_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
                if docfreq < IGNORE_WORDS_BELOW_COUNT]
    dictionary.filter_tokens(rare_ids)
    dictionary.compactify()
    print("Filtered to %d unique words." % (len(dictionary.keys()),))

    # Save to HDD.
    print("Saving dictionary...")
    dictionary.save(cfg.LDA_DICTIONARY_FILEPATH)
def train(args):
    """Main training method.

    Creates the feature generators, loads labeled windows from the
    corpus, converts them to feature/label chains, feeds those to a
    pycrfsuite trainer and trains the CRF model. Each example yields a
    token chain (unused here), a feature chain and a label chain.

    Args:
        args: Command line arguments as parsed by argparse.ArgumentParser.
    """
    trainer = pycrfsuite.Trainer(verbose=True)

    # Create/Initialize the feature generators; this may take a few minutes.
    print("Creating features...")
    feature_generators = features.create_features()

    # Initialize the window generator; each window has a fixed maximum
    # size in tokens.
    print("Loading windows...")
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH),
                           cfg.WINDOW_SIZE, feature_generators,
                           only_labeled_windows=True)

    # Add chains of features (lists of lists of strings) and chains of
    # labels (lists of strings) to the trainer.
    print("Adding example windows (up to max %d)..." % (cfg.COUNT_WINDOWS_TRAIN))
    examples = generate_examples(windows, nb_append=cfg.COUNT_WINDOWS_TRAIN,
                                 nb_skip=cfg.COUNT_WINDOWS_TEST, verbose=True)
    for feature_chain, label_chain, _tokens in examples:
        trainer.append(feature_chain, label_chain)

    # Train the model; this may take several hours.
    print("Training...")
    if cfg.MAX_ITERATIONS is not None and cfg.MAX_ITERATIONS > 0:
        # Cap the number of optimizer iterations if the config defines a
        # limit; otherwise the optimizer stops automatically on its own.
        trainer.set_params({'max_iterations': cfg.MAX_ITERATIONS})
    trainer.train(args.identifier)