def test(self, path):
    corp = Corpus(path)
    bs = Bayesian()
    count = 0
    sender_bl = load_pickle('sender_bl.pickle')
    # Scan each email and decide whether it is SPAM or HAM:
    # first check whether the sender appears in the sender blacklist,
    # then compute word spamicity using the Bayesian approach.
    for fname, body in corp.emails():
        sender = find_sender(body)
        if sender in sender_bl:
            self.tag_it(path, fname, 'SPAM')
            continue
        spamicity_list = []
        count += 1
        tokens = tokenize(body)
        # compute spamicity for each word and collect the values
        for el in tokens:
            word_spamicity = [el, bs.word_spamicity(el)]
            spamicity_list.append(word_spamicity)
        # prepare list for Bayes
        spamicity_list = [list(i) for i in set(map(tuple, spamicity_list))]  # remove duplicates from list
        spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
        prediction = bs.bayes_pred(spamicity_list[:15])  # consider only the 15 most significant 'words'
        if prediction > 0.9 or sender in sender_bl:
            self.tag_it(path, fname, 'SPAM')
        else:
            self.tag_it(path, fname, 'OK')
def _process_nonexp_sense(self, articles, which):
    nonexp_feat_name = FILE_PATH + '/../tmp/nonexp.feat'
    nonexp_sense_file = codecs.open(nonexp_feat_name, 'w', 'utf-8')
    nonexpParser = NonExplicit()  # change name later
    for art in articles:
        self.generate_nonexp_relations(art)
        for rel in art.nonexp_relations:
            nonexpParser.print_features(rel, ['xxxxx'], nonexp_sense_file)
    nonexp_sense_file.close()
    nonexp_pred_name = FILE_PATH + '/../tmp/nonexp.pred'
    Corpus.test_with_opennlp(nonexp_feat_name, nonexpParser.model_file, nonexp_pred_name)
    nonexp_res = [l.strip().split()[-1] for l in codecs.open(nonexp_pred_name, 'r', 'utf-8')]
    rid = 0
    for art in articles:
        for rel in art.nonexp_relations:
            pred_sense = nonexp_res[rid]
            if pred_sense == 'EntRel':
                r_type = 'EntRel'
            elif pred_sense == 'NoRel':
                r_type = 'NoRel'
            else:
                r_type = 'Implicit'
            rel.rel_type = r_type
            rel.sense = [pred_sense]
            rid += 1
    assert len(nonexp_res) == rid, 'nonexp relations size not match'
def contruir_corpus_experimento(self):
    '''Builds the dataset for the experiment.'''
    c = Corpus()
    if self.tamanio == 'BI':
        busquedaInicial = leer_archivo(open(self.directorio + 'bi.csv', 'r'), eliminar_primero=True)
        clasificados = leer_archivo(open(self.directorio + 'clasificados.csv', 'r'), eliminar_primero=True)
    elif self.tamanio == 'Univ':
        busquedaInicial = leer_archivo(open(self.directorio + 'dataPapers.csv', 'r'), eliminar_primero=True)
        clasificados = leer_archivo(open(self.directorio + 'validacion.csv', 'r'), eliminar_primero=True)
    conjuntoA = leer_archivo(open(self.directorio + 'a.csv', 'r'), eliminar_primero=True)
    conjuntoS = leer_archivo(open(self.directorio + 's.csv', 'r'), eliminar_primero=True)
    conjuntoJ = leer_archivo(open(self.directorio + 'j.csv', 'r'), eliminar_primero=True)
    conjuntoO = leer_archivo(open(self.directorio + 'o.csv', 'r'), eliminar_primero=True)
    xmls = self.obtener_xmls()
    # Files with the EIDs of the papers that will form the network
    ##archivo_papers_red = dividir_archivo_fecha(open(self.directorio + 'relevantes.csv'), open(self.directorio + 'relevantesFecha.csv'), 2013)
    archivo_papers_red = open(self.directorio + 'bi.csv')
    # List of the EIDs of the papers that will form the network
    lista_papers_red = leer_archivo(archivo_papers_red, eliminar_primero=True)
    # Author-paper pairs of the network
    dicci_contruir_red = obtener_autores(xmls, lista_papers_red)
    # All author-paper pairs of the corpus should be here
    dicci_todos_autores_papers = obtener_autores(xmls, leer_archivo(open(self.directorio + 'bi.csv'), eliminar_primero=True))
    # c.construir_corpus(self.nombre_corpus, busquedaInicial, conjuntoA, conjuntoS, conjuntoJ, conjuntoO, clasificados,
    #                    conjuntos_red=dicci_contruir_red, diccionario_todos_autores=dicci_todos_autores_papers)
    c.construir_corpus(self.nombre_corpus, busquedaInicial, conjuntoA, conjuntoS, conjuntoJ, conjuntoO, clasificados)
def main():
    logging.basicConfig(format=DefaultSetting.FORMAT_LOG, level=logging.INFO)
    start_time = datetime.now()
    input_file = 'data/content.with.categories.seg.vni'
    stopwords_file = 'data/stopwords.txt'
    num_topics = 100
    prefix_name = 'demo'
    directory = 'tmp'
    query = 'data/query.txt'
    corpus = Corpus()
    corpus.build_corpus(input_file, stopwords_file, directory, prefix_name)
    LDA = LDAModel()
    LDA.train(corpus.corpus, corpus.dictionary, num_topics, directory, prefix_name)
    LDA.show()
    docsim = DocSim()
    docsim.set_model(LDA.model)
    docsim.set_doc(corpus)
    docsim.vectorized(num_topics)
    # docsim.save(directory, prefix_name)
    print 'Training time: ', datetime.now() - start_time
    start_time = datetime.now()
    reader = codecs.open(query, 'r', 'utf8')
    documents = []
    for line in reader.readlines():
        documents.append(line.replace('\n', ''))
    docsim.query(documents, True, directory, prefix_name)
    docsim.query(documents, False, directory, prefix_name)
    print 'Query time: ', datetime.now() - start_time
def _process_parsed_conn(self, articles, which='test'):
    """Generate an explicit relation for each true discourse connective."""
    connParser = Connective()
    conn_feat_name = FILE_PATH + '/../tmp/conn.feat'
    conn_feat_file = codecs.open(conn_feat_name, 'w', 'utf-8')
    checked_conns = []
    for art in articles:
        checked_conns.append(connParser.print_features(art, which, conn_feat_file))
    conn_feat_file.close()
    conn_pred_name = FILE_PATH + '/../tmp/conn.pred'
    Corpus.test_with_opennlp(conn_feat_name, connParser.model_file, conn_pred_name)
    conn_res = [l.strip().split()[-1] for l in codecs.open(conn_pred_name, 'r', 'utf-8')]
    assert len(checked_conns) == len(articles), 'article size not match'
    s = 0
    for art, cand_conns in zip(articles, checked_conns):
        length = len(cand_conns)
        cand_res = conn_res[s:s + length]
        s += length
        for conn, label in zip(cand_conns, cand_res):
            if label == '1':
                rel = Relation()
                rel.doc_id = art.id
                rel.rel_type = 'Explicit'
                rel.article = art
                rel.conn_leaves = conn
                rel.conn_addr = [n.leaf_id for n in conn]
                art.exp_relations.append(rel)
    assert s == len(conn_res), 'conn size not match'
def preprocess(filename, stopword_filename=None, extra_stopwords=None):
    """
    Preprocesses a CSV file and returns a frozen Corpus built from it.

    Arguments:
    filename -- name of CSV file

    Keyword arguments:
    stopword_filename -- name of file containing stopwords
    extra_stopwords -- list of additional stopwords
    """
    stopwords = create_stopword_list(stopword_filename)
    stopwords.update(create_stopword_list(extra_stopwords))
    corpus = Corpus()
    for fields in reader(open(filename), delimiter=','):
        corpus.add(fields[0], tokenize(fields[-1], stopwords))
    corpus.freeze()
    return corpus
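# Usage sketch (assumed, not from the original source): the file names below are
# hypothetical, and the CSV layout (a label/id in the first column, document text in
# the last) is inferred from how the loop above indexes each row.
example_corpus = preprocess('reviews.csv',
                            stopword_filename='stopwords.txt',
                            extra_stopwords=['rt', 'via'])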
def output_json_format(self, parse_path, rel_path):
    preds = [it.strip().split()[-1] for it in open(self.predicted_file)]
    rel_dict = Corpus.read_relations(rel_path)
    idx = 0
    for art in Corpus.read_parses(parse_path, rel_dict):
        for rel in art.relations:
            if rel.rel_type == 'Explicit':
                continue
            pred_sense = preds[idx]
            json_dict = {}
            json_dict['DocID'] = rel.doc_id
            if pred_sense == 'EntRel':
                r_type = 'EntRel'
            elif pred_sense == 'NoRel':
                r_type = 'NoRel'
            else:
                r_type = 'Implicit'
            json_dict['Type'] = r_type
            json_dict['Sense'] = [pred_sense.replace('_', ' ')]
            json_dict['Connective'] = {}
            json_dict['Connective']['TokenList'] = []
            json_dict['Arg1'] = {}
            json_dict['Arg1']['TokenList'] = []
            json_dict['Arg2'] = {}
            json_dict['Arg2']['TokenList'] = []
            print json.dumps(json_dict)
            idx += 1
def test_interro4():
    print('Testing interrogation 4')
    corp = Corpus('data/test-stripped-tokenised')
    data = corp.interrogate({'n': 'any'})
    d = {'and interrogating': {'first': 0, 'second': 2},
         'concordancing and': {'first': 0, 'second': 2}}
    assert_equals(data.results.to_dict(), d)
def prepare_data(self, parse_path, rel_path, which, to_file):
    rel_dict = Corpus.read_relations(rel_path)
    for art in Corpus.read_parses(parse_path, rel_dict):
        for rel in art.relations:
            if rel.rel_type != 'Explicit':
                continue
            rel.article = art
            rel.get_conn_leaves()
        self.print_features(art, which, to_file)
def generateData():
    rep = Representor(None, 'citybeat', 'next_week_candidate_event_25by25_merged')
    corpus = Corpus()
    corpus.buildCorpusOnDB('citybeat', 'next_week_candidate_event_25by25_merged')
    true_event_list, false_event_list = loadNextWeekData()
    EventFeatureTwitter(None).GenerateArffFileHeader()
    for event in true_event_list + false_event_list:
        EventFeatureTwitter(event, corpus, rep).printFeatures()
def test_parse():
    import shutil
    print('Testing parser')
    unparsed = Corpus(unparsed_path)
    try:
        shutil.rmtree('data/test-parsed')
    except:
        pass
    parsed = unparsed.parse()
    assert_equals(list([i.name for i in parsed.files]), ['intro.txt.xml', 'body.txt.xml'])
def test_bb_target_state_halfed(self):
    feature_table = FeatureTable.load(get_feature_table_fixture("a_b_and_cons_feature_table.json"))
    constraint_set = ConstraintSet.load(get_constraint_set_fixture("bb_target_constraint_set.json"), feature_table)
    target_lexicon_words = Corpus.load(get_corpus_fixture("bb_target_lexicon_halfed.txt")).get_words()
    lexicon = Lexicon(target_lexicon_words, feature_table)
    grammar = Grammar(feature_table, constraint_set, lexicon)
    corpus = Corpus.load(get_corpus_fixture("bb_corpus.txt"))
    traversable_hypothesis = TraversableGrammarHypothesis(grammar, corpus)
    self.assertEqual(traversable_hypothesis.get_energy(), 407430)
def test_parse_speakseg(skipassert=False):
    print('Testing parser with speaker segmentation')
    unparsed = Corpus(unparsed_path)
    import shutil
    try:
        shutil.rmtree(parsed_path)
    except:
        pass
    parsed = unparsed.parse(speaker_segmentation=True)
    if not skipassert:
        assert_equals(list([i.name for i in parsed.files]), ['intro.txt.xml', 'body.txt.xml'])
def _get_corpus(self):
    self.training_corpus = Corpus()
    self.training_corpus.load_from_file(self.training_corpus_f)
    self.unlabeled_corpus = Corpus()
    self.unlabeled_corpus.load_from_file(self.u_corpus_f)
    self.test_corpus = Corpus()
    self.test_corpus.load_from_file(self.test_corpus_f)
    self.user_corpus = Corpus()
def scoring(self, method='zagibolov'):
    # Supply arguments to Corpus to connect to the database: user, password and db.
    corpus = Corpus(password='', db='project_major')
    corpus.getTweets()
    dataset = corpus.dataSet
    preprocess = Preprocess('zagibolov', self.lexicons, self.negatives, self.stopWords)
    scoring = Scoring(method, self.lexicons, self.negatives, self.stopWords, self.seeds)
    j = 0
    for data in dataset:
        preprocess.preprocessScoring(data)
    processed = preprocess.processed_data
    for data in processed:
        scoring.count(data['tweet'])
    ## print self.seeds
    preprocess.seeds = scoring.lexicon_count
    preprocess.processLexicon()
    scoring.lexicons = preprocess.lexicons
    ## print scoring.lexicon_count
    last_score = {}
    i = 0
    for i in range(0, 3):
        total = 0
        j = 0
        negative = 0
        positive = 0
        scoring.resetLexiconCount()
        ## print self.lexicons
        for data in processed:
            if j == 50:
                break
            j += 1
            score = scoring.score(data)
            if score != 0:
                total += 1
                if score < 0:
                    negative += 1
                else:
                    positive += 1
        scoring.adjustScoring()
        if last_score == {}:
            last_score = scoring.lexicons
            this_score = last_score
        else:
            this_score = scoring.lexicons
            if this_score == last_score:
                break
            else:
                last_score = this_score
    print this_score
    print "Total scored: " + str(total), "Negative: ", negative, "Positive: ", positive
def test_corpusContainsOnlyEmails(self):
    """Test reading the corpus with email messages only."""
    corpus = Corpus(CORPUS_DIR)
    # Exercise the SUT
    observed = {}
    for fname, contents in corpus.emails():
        observed[fname] = contents
    # Verify the results
    self.assertEqual(len(self.expected), len(observed),
                     'The emails() method did not generate all the corpus files.')
    self.assertEqual(self.expected, observed,
                     'The read file contents are not equal to the expected contents.')
def __init__(self, path, dirname):
    import os
    from os.path import join, isfile, isdir
    self.path = join(dirname, path)
    kwargs = {'print_info': False, 'level': 'f'}
    Corpus.__init__(self, self.path, **kwargs)
    if self.path.endswith('.p'):
        self.datatype = 'tokens'
    elif self.path.endswith('.xml'):
        self.datatype = 'parse'
    else:
        self.datatype = 'plaintext'
def test_corpusContainsOnlyEmails(self):
    """Test reading the corpus with email messages only."""
    corpus = Corpus(CORPUSDIR)
    # Exercise the SUT
    nitems = 0
    for fname, contents in corpus.emails_as_string():
        nitems += 1
        # Validate the results
        self.assertEqual(self.expected[fname], contents,
                         'The read file contents are not equal to the expected contents.')
    self.assertEqual(nitems, NEMAILS,
                     'The emails_as_string() method did not return the right number of files.')
def test_aspiration_and_lengthening_extended_augmented_target_state(self):
    configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 100
    configurations["RESTRICTION_ON_ALPHABET"] = True
    feature_table = FeatureTable.load(
        get_feature_table_fixture("aspiration_and_lengthening_extended_augmented_feature_table.json"))
    constraint_set = ConstraintSet.load(
        get_constraint_set_fixture("aspiration_and_lengthening_augmented_target_constraint_set.json"),
        feature_table)
    target_lexicon_words = Corpus.load(
        get_corpus_fixture("aspiration_and_lengthening_extended_target_lexicon.txt")).get_words()
    lexicon = Lexicon(target_lexicon_words, feature_table)
    grammar = Grammar(feature_table, constraint_set, lexicon)
    corpus = Corpus.load(get_corpus_fixture("aspiration_and_lengthening_extended_corpus.txt"))
    traversable_hypothesis = TraversableGrammarHypothesis(grammar, corpus)
def test(self, test_corpus_dir):
    test_corpus = Corpus(test_corpus_dir)
    with open(os.path.join(test_corpus_dir, "!prediction.txt"), "w+") as a_file:
        for filename, body in test_corpus.emails():
            if self.bayesian_combination(body) > 0.9 or self.get_email_adress(body) in self.black_list:
                decision = "SPAM"
            else:
                if self.get_email_adress(body) in self.white_list:
                    decision = "OK"
                else:
                    decision = "OK"
            a_file.write(filename + " " + decision + "\n")
def __init__(self, path, dirname, datatype):
    import os
    from os.path import join, isfile, isdir
    self.path = join(dirname, path)
    kwargs = {"print_info": False, "level": "f", "datatype": datatype}
    Corpus.__init__(self, self.path, **kwargs)
    if self.path.endswith(".p"):
        self.datatype = "tokens"
    elif self.path.endswith(".xml"):
        self.datatype = "parse"
    else:
        self.datatype = "plaintext"
def _process_parsed_argpos(self, articles, which='test'):
    argpos_feat_name = FILE_PATH + '/../tmp/argpos.feat'
    argpos_feat_file = codecs.open(argpos_feat_name, 'w', 'utf-8')
    argpos_checked = []
    argposParser = ArgPos()
    for art in articles:
        for rel in art.exp_relations:
            argpos_checked.append(argposParser.print_features(rel, which, argpos_feat_file))
    argpos_feat_file.close()
    argpos_pred_name = FILE_PATH + '/../tmp/argpos.pred'
    Corpus.test_with_opennlp(argpos_feat_name, argposParser.model_file, argpos_pred_name)
    argpos_res = [l.strip().split()[-1] for l in codecs.open(argpos_pred_name, 'r', 'utf-8')]
    return argpos_res
def test_load_and_save(self):
    """Load and save functions must be inverses."""
    filename = 'testing_file'
    self.co.save_to_file(filename)
    new_co = Corpus()
    new_co.load_from_file(filename)
    self.assertTrue(_eq_crs_matrix(new_co.instances, self.co.instances))
    for index in range(len(self.co)):
        self.assertEqual(self.co.full_targets[index], new_co.full_targets[index])
        self.assertEqual(self.co.representations[index], new_co.representations[index])
    self.assertIsNotNone(new_co.primary_targets)
def run_simulation(configurations_tuples, simulation_number, log_file_template,
                   feature_table_file_name, corpus_file_name, constraint_set_file_name,
                   sample_target_lexicon=None, sample_target_outputs=None,
                   target_lexicon_indicator_function=None, target_constraint_set_file_name=None,
                   target_lexicon_file_name=None, convert_corpus_word_to_target_word_function=None,
                   initial_lexicon_file_name=None):
    for configurations_tuple in configurations_tuples:
        configurations[configurations_tuple[0]] = configurations_tuple[1]
    log_file_name = log_file_template.format(platform.node(), simulation_number)
    dirname, filename = split(abspath(__file__))
    log_file_path = join(dirname, "../logging/", log_file_name)
    # if os.path.exists(log_file_path):
    #     raise ValueError("log name already exists")
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    file_log_formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s', "%Y-%m-%d %H:%M:%S")
    file_log_handler = logging.FileHandler(log_file_path, mode='w')
    file_log_handler.setFormatter(file_log_formatter)
    logger.addHandler(file_log_handler)
    feature_table = FeatureTable.load(get_feature_table_fixture(feature_table_file_name))
    corpus = Corpus.load(get_corpus_fixture(corpus_file_name))
    constraint_set = ConstraintSet.load(get_constraint_set_fixture(constraint_set_file_name), feature_table)
    if initial_lexicon_file_name:
        corpus_for_lexicon = Corpus.load(get_corpus_fixture(initial_lexicon_file_name))
        lexicon = Lexicon(corpus_for_lexicon.get_words(), feature_table)
    else:
        lexicon = Lexicon(corpus.get_words(), feature_table)
    grammar = Grammar(feature_table, constraint_set, lexicon)
    data = corpus.get_words()
    traversable_hypothesis = TraversableGrammarHypothesis(grammar, data)
    keyargs_dict = {}
    if sample_target_lexicon and sample_target_outputs and target_lexicon_indicator_function:
        keyargs_dict["sample_target_lexicon"] = sample_target_lexicon
        keyargs_dict["sample_target_outputs"] = sample_target_outputs
        keyargs_dict["target_lexicon_indicator_function"] = target_lexicon_indicator_function
    if target_constraint_set_file_name and (target_lexicon_file_name or convert_corpus_word_to_target_word_function):
        target_energy = get_target_hypothesis_energy(feature_table, target_constraint_set_file_name, corpus,
                                                     target_lexicon_file_name,
                                                     convert_corpus_word_to_target_word_function)
        keyargs_dict["target_energy"] = target_energy
    simulated_annealing = SimulatedAnnealing(traversable_hypothesis, **keyargs_dict)
    simulated_annealing.run()
def test_parse():
    import shutil
    print('Testing parser')
    unparsed = Corpus(unparsed_path)
    try:
        shutil.rmtree('data/test-parsed')
    except:
        pass
    parsed = unparsed.parse()
    fnames = []
    for subc in parsed.subcorpora:
        for f in subc.files:
            fnames.append(f.name)
    assert_equals(fnames, ['intro.txt.xml', 'body.txt.xml'])
def prepare_data(self, parse_path, rel_path, which, to_file):
    rel_dict = Corpus.read_relations(rel_path)
    for art in Corpus.read_parses(parse_path, rel_dict):
        for rel in art.relations:
            if rel.rel_type != 'Explicit':
                continue
            rel.article = art
            rel.get_conn_leaves()
            labels = {s.replace(' ', '_') for s in rel.sense}
            labels = {s for s in labels if s in SENSES}
            if which == 'test':
                labels = ['|'.join(labels)]
            self.print_features(rel, labels, which, to_file)
def test_t_aspiration_target_state(self):
    configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
    feature_table = FeatureTable.load(get_feature_table_fixture("t_aspiration_feature_table.json"))
    constraint_set = ConstraintSet.load(get_constraint_set_fixture("t_aspiration_target_constraint_set.json"),
                                        feature_table)
    target_lexicon_words = Corpus.load(get_corpus_fixture("t_aspiration_target_lexicon.txt")).get_words()
    lexicon = Lexicon(target_lexicon_words, feature_table)
    grammar = Grammar(feature_table, constraint_set, lexicon)
    corpus = Corpus.load(get_corpus_fixture("t_aspiration_corpus.txt"))
    traversable_hypothesis = TraversableGrammarHypothesis(grammar, corpus)
    configurations["RESTRICTION_ON_ALPHABET"] = True
    self.assertEqual(traversable_hypothesis.get_energy(), 167838)
    configurations["RESTRICTION_ON_ALPHABET"] = False
    self.assertEqual(traversable_hypothesis.get_energy(), 173676)
def test_parse_speakseg(skipassert=False):
    print('Testing parser with speaker segmentation')
    unparsed = Corpus(unparsed_path)
    import shutil
    try:
        shutil.rmtree(parsed_path)
    except:
        pass
    parsed = unparsed.parse(speaker_segmentation=True)
    fnames = []
    for subc in parsed.subcorpora:
        for f in subc.files:
            fnames.append(f.name)
    assert_equals(fnames, ['intro.txt.xml', 'body.txt.xml'])
def process_projects(src_directory, glossary_description, glossary_file):
    corpus = Corpus(src_directory)
    corpus.process()
    reference_sources = ReferenceSources()
    reference_sources.read_sources()
    metrics = Metrics()
    metrics.create(corpus)

    # Select terms
    MAX_TERMS = 5000
    sorted_terms_by_tfxdf = sorted(metrics.tfxdf, key=metrics.tfxdf.get, reverse=True)

    # Developer report
    glossary_entries = OrderedDict()
    translations = Translations()
    selected_terms = sorted_terms_by_tfxdf[:MAX_TERMS]  # Sorted by frequency
    for term in selected_terms:
        glossary_entries[term] = translations.create_for_word_sorted_by_frequency(corpus.documents, term,
                                                                                  reference_sources)
    dev_glossary_serializer = DevGlossarySerializer()
    dev_glossary_serializer.create(u"dev-" + glossary_file + ".html", glossary_description, corpus,
                                   glossary_entries, reference_sources)

    # User report
    glossary_entries = []
    selected_terms = sorted(sorted_terms_by_tfxdf[:MAX_TERMS])  # Sorted by term
    glossary = Glossary(glossary_description)
    for term in selected_terms:
        glossary_entry = GlossaryEntry(
            term,
            translations.create_for_word_sorted_by_frequency(corpus.documents, term, reference_sources)
        )
        glossary.entries.append(glossary_entry)
    glossary_entries = glossary.get_dict()
    process_template('templates/userglossary-html.mustache', glossary_file + ".html", glossary_entries)
    process_template('templates/userglossary-csv.mustache', glossary_file + ".csv", glossary_entries)
    generate_database(glossary, glossary_file)
def test_corpusContainsAlsoSpecialFiles(self):
    """Test reading the corpus with special files."""
    # Add a special file into the corpus dir
    save_file_to_corpus_dir(fname=SPECIAL_FILENAME, contents='fake', dirname=CORPUS_DIR)
    corpus = Corpus(CORPUS_DIR)
    # Exercise the SUT
    observed = {}
    for fname, contents in corpus.emails():
        observed[fname] = contents
    # Verify the results
    self.assertEqual(len(self.expected), len(observed),
                     'The emails() method did not generate all the corpus files.')
    self.assertEqual(self.expected, observed,
                     'The read file contents are not equal to the expected contents.')
def __init__(self, dictionary=None, corpus=None, index_file=None, max_docs=None, **kwargs):
    Corpus.__init__(self, dictionary=dictionary, corpus=corpus)
    self.clip_corpus(max_docs)
    # Set up for KNN
    features = len(self.dictionary)
    self.index = AnnoyIndex(features)
    start_time = datetime.datetime.now()
    if not index_file:
        self.transform_corpus(models.TfidfModel)
        for i, vector in enumerate(self):
            self.index.add_item(i, list(sparse2full(vector, features).astype(float)))
        self.index.build(self.no_trees)
    else:
        self.index.load(index_file)
    end_time = datetime.datetime.now()
    self.train_time = end_time - start_time
    return
def _get_indices(self, sentence):
    word_list = list(self._words)
    indices = []
    for token in Corpus.tokenize(sentence):
        if token in self._words:
            index = word_list.index(token)
            indices.append(index)
    return indices
def build(cls, files):
    corpora = []
    for file in files:
        ext = os.path.splitext(file)[1]
        corpus = Corpus(open(file, 'rb'), cls.ext_to_sentiment[ext])
        corpora.append(corpus)
    corpus_set = CorpusSet(corpora)
    return SentimentClassifier(corpus_set)
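# Usage sketch (assumed, not from the original source): build() appears to be a
# classmethod of SentimentClassifier, and the hypothetical file extensions below
# ('.pos', '.neg') are assumed to be keys of SentimentClassifier.ext_to_sentiment.
classifier = SentimentClassifier.build(['reviews.pos', 'reviews.neg'])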
def main():
    pred_with_tweets = '../data/trial.csv'  # predicted labels + tweet text
    gold = '../data/trial.labels'  # file containing the gold labels
    mycorpus = Corpus(pred_with_tweets, gold)
    myscores = Scorer(mycorpus)
    myresult = Result()
    myresult.show(myscores)
def process_corpus(language, source, filelist, corpus_location, verbose):
    """Create a corpus at corpus_location and run the default pipeline over it."""
    pipeline = config.DEFAULT_PIPELINE
    if language == 'cn':
        pipeline = config.DEFAULT_PIPELINE_CN
    pipeline_file = config.DEFAULT_PIPELINE_CONFIGURATION_FILE
    corpus = Corpus(language=language, datasource=source, source_file=filelist,
                    corpus_path=corpus_location, pipeline_config=pipeline)
    rconfig = RuntimeConfig(corpus_location, language, source, pipeline_file, verbose=verbose)
    corpus.run_default_pipeline(rconfig)
def main():
    QAfile = sys.argv[1]
    ReviewFile = sys.argv[2]
    minReview = int(sys.argv[3])
    V = int(sys.argv[4])
    k = int(sys.argv[5])
    numiter = int(sys.argv[6])
    Lambda = float(sys.argv[7])
    predictionsOut = sys.argv[8]
    rankingOut = sys.argv[9]
    create_corpus = sys.argv[10]  # takes 0 or 1 as argument
    corpus_pickle_file = "./Data/corpus_{}.pkl".format(QAfile.split("/")[-1].split(".")[0])
    if int(create_corpus):
        corpus = Corpus(QAfile, ReviewFile, minReview, V)
        corpus.construct_QAnswersAndQPerItem()
        corpus.construct_SentencesAndSPerItem()
        corpus.Calculate_PairWiseFeature()
        with open(corpus_pickle_file, 'wb') as f:
            pickle.dump(corpus, f)
    else:
        with open(corpus_pickle_file, 'rb') as f:
            corpus = pickle.load(f)
    print("corpus is available")
    print(("Vocabulary Size: " + str(corpus.Map.V)))
    print(("Number of Questions: " + str(len(corpus.QAnswers))))
    print(("Number of Reviews: " + str(len(corpus.Sentences))))
    print(("Number of Items " + str(len(corpus.Map.ItemIDMap))))
    print(("Avg review length " + str(sum(corpus.Avgdl.values()) / len(corpus.Avgdl))))
def train(self, file_path, batch_size=10, learning_rate=0.1, lr_decay=0.05, epochs=1000, momentum=0.0):
    corpus = Corpus(file_path)
    truth_dict = utils.read_classification_from_file(file_path + "/!truth.txt")
    got_data = True
    mails_getter = corpus.emails()
    batches = []
    # loads all data from the directory in batches of the given size
    while got_data:
        batch = []
        # loads a batch of the given size, a smaller one if out of data
        for i in range(batch_size):
            try:
                email = next(mails_getter)
                batch.append((email[1], 1 if truth_dict[email[0]] == self.pos_tag else 0))
            except StopIteration:
                got_data = False
                break
        batches.append(batch)
    for e in range(epochs):  # trains multiple times on all batches
        self.init_momentums()
        for batch in batches:  # performs gradient descent on each batch
            # gets feature vectors for the batch
            feature_vectors = [(m[0].get_feature_vector_plr()) for m in batch]
            y = [m[1] for m in batch]  # gets the truth vector of the batch
            for i in range(self.subvector_count):  # weights for each subvector are trained separately
                subvector_batch = [v[i] for v in feature_vectors]  # isolates a subvector from all vectors
                self.gradient_descent(i, y, subvector_batch, learning_rate, momentum)
        print(f"trained on epoch #{e + 1}")
        learning_rate *= 1 / (1 + lr_decay * e)
def _process_parsed_argpos(self, articles, which='test'):
    argpos_feat_name = FILE_PATH + '/../tmp/argpos.feat'
    argpos_feat_file = codecs.open(argpos_feat_name, 'w', 'utf-8')
    argpos_checked = []
    argposParser = ArgPos()
    for art in articles:
        for rel in art.exp_relations:
            argpos_checked.append(argposParser.print_features(rel, which, argpos_feat_file))
    argpos_feat_file.close()
    argpos_pred_name = FILE_PATH + '/../tmp/argpos.pred'
    Corpus.test_with_opennlp(argpos_feat_name, argposParser.model_file, argpos_pred_name)
    argpos_res = [l.strip().split()[-1] for l in codecs.open(argpos_pred_name, 'r', 'utf-8')]
    return argpos_res
def main():
    nnlm = NNLM()
    corpus = Corpus.read_corpus(sys.argv[1])
    corpus.filter_freq(10000)
    nnlm.add_corpus(corpus)
    nnlm.create_model()
    nnlm.create_algorithm()
    nnlm.create_training_problem(sys.argv[2])
    nnlm.trainer.main_loop()
def __init__(self, frontier):
    self.frontier = frontier
    self.corpus = Corpus()
    self.subdomains = defaultdict(int)
    self.most_valid_links = (None, -1)
    self.downloaded_urls = set()
    self.traps = set()
    # list of all links ever added to the frontier, w/o scheme
    self.front = []
    # set of links that have already been parsed
    self.dup = set()
    # for comparing url similarities
    self.compare_url = None
    self.similar_url_count = 0
    self.compare_traps = set()
def main():
    parser = argparse.ArgumentParser('LM main')
    parser.add_argument('--corpus', type=str, default='tiny_corpus.txt', help='corpus to train')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--emb_dim', type=int, default=128)
    parser.add_argument('--num_layers', type=int, default=1)
    parser.add_argument('--drop', type=float, default=0.1)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--momentum', type=float, default=.99)
    parser.add_argument('--clip_norm', type=float, default=5)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--save', type=str, default='model.pt')
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--arpa', type=str, default='tiny_corpus.arpa')
    args = parser.parse_args()

    corpus = Corpus(args.corpus)
    loader = CorpusLoader(corpus, args.batch_size, True, args.num_workers)
    if args.load is None:
        extractor = EmbeddingExtractor(corpus.vocab, args.emb_dim, args.device)
        network = Network(extractor, args.num_layers, drop=args.drop).to(args.device)
    else:
        network = torch.load(args.load, map_location=args.device)
        network.extractor.device = args.device
        network.rnn.flatten_parameters()
    ken_lm = kenlm.LanguageModel(args.arpa)
    optimizer = torch.optim.SGD(network.parameters(), args.lr, args.momentum)
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, verbose=True, factor=.5)
    min_loss = float('inf')
    for epoch in range(args.epochs):
        pred = generate(network, device=args.device)
        gen_sentence = ' '.join(pred)
        ppl = ken_lm.perplexity(gen_sentence)
        print('%s\nPPL:\t%f' % (gen_sentence, ppl))
        loss = single_epoch(network, loader, optimizer, loss_fn, args.clip_norm)
        print('epochs %d \t loss %.3f' % (epoch, loss))
        scheduler.step(loss)
        if min_loss > loss:
            min_loss = loss
            print('saving to %s' % args.save)
            torch.save(network, args.save)
        print()
def channels_from_paths(self, paths, channels=None):
    # avoid a mutable default argument; start with an empty channel list
    if channels is None:
        channels = []
    for path in paths:
        name = path.split('/')[1:]
        if not self.name_in_channels(name, channels):
            source_text = open(path).read()
            corpus = Corpus(source_text, name)
            channels.append(Channel(self, self.textbox, corpus, len(channels)))
    return channels
def get_answer_candidates(corpus_dict: TYPE_CORPUS_DICT):
    ac = AnswerCandidate()
    for name, path in corpus_dict.items():
        corpus = ac.add_candidates_to_corpus(Corpus(path, name))
        corpus.save('data/candidate_answer_{}.pkl'.format(name))
        corpus_dict[name] = corpus
    ac.cache_similarity_query()
    # ac.cache_relation_paths()
    return corpus_dict
def analysis(corpus):
    dictionary = Dictionary()
    dictionary.load_dictionary()
    pairs = dictionary.get_pairs()
    diacritics, no_diacritics = Corpus().get_dictionaries_frequencies_and_sentences(corpus, pairs)
    update_pairs(pairs, diacritics, no_diacritics)

    diacritics_corpus = 0
    position = 0
    not_found_in_corpus_pos = {}
    with open('diacritics.csv', 'w') as writer:
        msg = f"diacritic_word\tdiacritic_pos\tdiacritic_freq\t"
        msg += f"no_diacritic_word\tno_diacritic_pos\tno_diacritic_freq\t"
        msg += f"total_freq\tcnt\n"
        writer.write(msg)
        for pair in pairs.values():
            diacritic = pair.diacritic
            no_diacritic = pair.no_diacritic
            if diacritic.frequency == 0 and no_diacritic.frequency == 0:
                logging.debug(f"Frequency 0: {diacritic.word}, {diacritic.pos}")
                pos = diacritic.pos
                if pos in not_found_in_corpus_pos:
                    counter = not_found_in_corpus_pos[pos]
                else:
                    counter = 0
                counter = counter + 1
                not_found_in_corpus_pos[pos] = counter
                continue
            total_freq = diacritic.frequency + no_diacritic.frequency
            msg = f"{diacritic.word}\t{diacritic.pos}\t{diacritic.frequency}\t"
            msg += f"{no_diacritic.word}\t{no_diacritic.pos}\t{no_diacritic.frequency}\t{total_freq}\t{position}\n"
            diacritics_corpus = diacritics_corpus + 1
            position = position + 1
            writer.write(msg)

    diacritics_dict = len(pairs)
    pdiacritics_dict = diacritics_dict * 100 / len(dictionary.words)
    pdiacritics_corpus = diacritics_corpus * 100 / diacritics_dict
    logging.info(f"Total unique words in dictionary: {len(dictionary.words)}")
    logging.info(f"Words with diacritic/no diacritic version {diacritics_dict} ({pdiacritics_dict:.2f}%) (in dictionary)")
    logging.info(f"Words with diacritic/no diacritic {diacritics_corpus} ({pdiacritics_corpus:.2f}%) (in corpus)")
    len_pos = sum(int(v) for v in not_found_in_corpus_pos.values())
    for pos in not_found_in_corpus_pos:
        counter = not_found_in_corpus_pos[pos]
        pcounter = counter * 100 / len_pos
        logging.info(f"Not found in corpus, grammar category {pos} - number: {counter} ({pcounter:.2f}%)")
    return pairs
def prepare_data(self, parse_path, rel_path, which, to_file):
    rel_dict = Corpus.read_relations(rel_path)
    articles = []
    dist = defaultdict(int)
    for art in Corpus.read_parses(parse_path, rel_dict):
        articles.append(art)
        for rel in art.relations:
            rel.article = art
            rel.get_arg_leaves()
            if rel.rel_type == 'Explicit':
                continue
            labels = {s.replace(' ', '_') for s in rel.sense}
            for l in labels:
                dist[l] += 1
            if which == 'test':
                labels = ['|'.join(labels)]
            self.print_features(rel, labels, to_file)
    # add NoRel relations
    for art in articles:
        for s1, s2 in zip(art.sentences[:-1], art.sentences[1:]):
            if not art.has_inter_relation(s1.id):
                rel = Relation()
                rel.article = art
                rel.doc_id = art.id
                rel.arg1s['parsed'] = [s1.tree.root] if not s1.tree.is_null() else []
                rel.arg1_leaves = self.remove_leading_tailing_punc(s1.leaves)
                rel.arg1_addr = [n.leaf_id for n in rel.arg1_leaves]
                rel.arg1_sid = rel.arg1_leaves[-1].goto_tree().sent_id if len(rel.arg1_leaves) > 0 else -1
                rel.arg1_text = ' '.join(n.value for n in rel.arg1_leaves)
                rel.arg2s['parsed'] = [s2.tree.root] if not s2.tree.is_null() else []
                rel.arg2_leaves = self.remove_leading_tailing_punc(s2.leaves)
                rel.arg2_addr = [n.leaf_id for n in rel.arg2_leaves]
                rel.arg2_sid = rel.arg2_leaves[0].goto_tree().sent_id if len(rel.arg2_leaves) > 0 else -1
                rel.arg2_text = ' '.join(n.value for n in rel.arg2_leaves)
                self.print_features(rel, ['NoRel'], to_file)
def test_corpusContainsAlsoSpecialFiles(self):
    """Test reading the corpus with special files."""
    # Add a special file into the corpus dir
    save_file_to_corpus_dir(fname=SPECIAL_FILENAME, contents='fake', dirname=CORPUS_DIR)
    corpus = Corpus(CORPUS_DIR)
    # Exercise the SUT
    observed = {}
    with replaced_open():
        for fname, contents in corpus.emails():
            observed[fname] = contents
    # Verify the results
    self.assertEqual(
        len(self.expected), len(observed),
        'The emails() method did not generate all the corpus files.')
    self.assertEqual(
        self.expected, observed,
        'The read file contents are not equal to the expected contents.')
def genSpecVec(self, origin_line):
    '''Calculate the spec_vec for a line.

    @param origin_line: an origin line fetched from the database, like ("asngy033", "zzy", "bhuv", ...).
    @return: the spec_vec for the line, in a format like [0.3, 0.1, -3.2, ...].
             The dimension is determined by the Word2Vec model.
    '''
    line = Corpus.addFieldForSingle(origin_line)
    return self.int_genSpecVec(line)
def test_get_doc_topic_sims(self):
    actual_output = Cp.get_doc_topic_sims(doc_embeddings, topic_embeddings)
    desired_output = {  # {doc-id: {topic/cluster-label: similarity}}
        1: np.array([0.9486832980505138, 0.9486832980505138]),
        10: np.array([0.9761870601839528, 0.6507913734559685]),
        3: np.array([0.6507913734559685, 0.9761870601839528])
    }
    actual_output = {k: list(v) for k, v in actual_output.items()}
    desired_output = {k: list(v) for k, v in desired_output.items()}
    self.assertEqual(actual_output, desired_output)
def parse(self, corenlppath=False, operations=False, copula_head=True,
          speaker_segmentation=False, memory_mb=False, *args, **kwargs):
    """
    Parse an unparsed corpus, saving to disk

    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str

    :param operations: which kinds of annotations to do
    :type operations: str

    :param speaker_segmentation: add speaker name to parser output if your corpus is script-like
    :type speaker_segmentation: bool

    :param memory_mb: Amount of memory in MB for parser
    :type memory_mb: int

    :param copula_head: Make copula head in dependency parse
    :type copula_head: bool

    :Example:

    >>> parsed = corpus.parse(speaker_segmentation = True)
    >>> parsed
    <corpkit.corpus.Corpus instance: speeches-parsed; 9 subcorpora>

    :returns: The newly created :class:`corpkit.corpus.Corpus`
    """
    from make import make_corpus
    from corpus import Corpus
    #from process import determine_datatype
    #dtype, singlefile = determine_datatype(self.path)
    if self.datatype != 'plaintext':
        raise ValueError('parse method can only be used on plaintext corpora.')
    kwargs.pop('parse', None)
    kwargs.pop('tokenise', None)
    return Corpus(
        make_corpus(self.path,
                    parse=True,
                    tokenise=False,
                    corenlppath=corenlppath,
                    operations=operations,
                    copula_head=copula_head,
                    speaker_segmentation=speaker_segmentation,
                    memory_mb=memory_mb,
                    *args,
                    **kwargs))
def train(self, file_path):
    self.content_spam_dict = {}
    self.content_ham_dict = {}
    class_dict = utils.read_classification_from_file(file_path + '/!truth.txt')
    corpus = Corpus(file_path)
    email_generator = corpus.emails()
    content_counter_spam = Counter()
    content_counter_ham = Counter()
    content_wordcount_spam = 0
    content_wordcount_ham = 0
    spam_count = 0
    ham_count = 0
    every_word_content = set()
    for mail in email_generator:
        content_words = self.string_to_words(mail[1].content_no_html)
        content_counter = Counter(content_words)
        for word in content_words:
            every_word_content.add(word)
        if class_dict[mail[0]] == self.pos_tag:
            spam_count += 1
            content_counter_spam += content_counter
            content_wordcount_spam += len(content_words)
        else:
            ham_count += 1
            content_counter_ham += content_counter
            content_wordcount_ham += len(content_words)
    for word in every_word_content:
        content_counter_ham[word] += 1
        content_counter_spam[word] += 1
        self.content_spam_dict[word] = content_counter_spam[word] / content_wordcount_spam
        self.content_ham_dict[word] = content_counter_ham[word] / content_wordcount_ham
    self.spam_probability = spam_count / (spam_count + ham_count)
    self.ham_probability = ham_count / (spam_count + ham_count)
    self.trained = True
def _process_exp_sense(self, articles, which='test'):
    exp_feat_name = FILE_PATH + '/../tmp/exp.feat'
    expParser = Explicit()
    exp_sense_file = codecs.open(exp_feat_name, 'w', 'utf-8')
    for art in articles:
        for rel in art.exp_relations:
            expParser.print_features(rel, ['xxxxx'], which, exp_sense_file)
    exp_sense_file.close()
    exp_pred = FILE_PATH + '/../tmp/exp.pred'
    Corpus.test_with_opennlp(exp_feat_name, expParser.model_file, exp_pred)
    exp_res = [l.strip().split()[-1] for l in codecs.open(exp_pred, 'r', 'utf-8')]
    rid = 0
    for art in articles:
        for rel in art.exp_relations:
            pred_sense = exp_res[rid]
            rel.sense = [pred_sense]
            rid += 1
def generateData2(_182, sparse=False):
    rep = Representor()
    corpus = Corpus()
    corpus.buildCorpusOnDB('citybeat', 'candidate_event_25by25_merged')
    true_event_list, false_event_list = loadUnbalancedData(_182)
    if sparse:
        word_index, word_list = getCorpusWordList(rep, true_event_list + false_event_list)
        EventFeatureSparse(None).GenerateArffFileHeader(word_list)
    else:
        EventFeatureTwitter(None).GenerateArffFileHeader()
    for event in true_event_list + false_event_list:
        if not sparse:
            EventFeatureTwitter(event, corpus, rep).printFeatures()
        else:
            EventFeatureSparse(event, corpus, rep).printFeatures(word_index)
def _process_exp_sense(self, articles, which='test'):
    exp_feat_name = FILE_PATH + '/../tmp/exp.feat'
    expParser = Explicit()
    exp_sense_file = open(exp_feat_name, 'w')
    for art in articles:
        for rel in art.exp_relations:
            expParser.print_features(rel, ['Conjunction'], which, exp_sense_file)
    exp_sense_file.close()
    exp_vec = FILE_PATH + '/../tmp/exp.vec'
    exp_pred = FILE_PATH + '/../tmp/exp.pred'
    # Corpus.test_with_svm(exp_feat_name, expParser.feat_map_file, exp_vec, expParser.model_file, exp_pred)
    Corpus.test_with_opennlp(exp_feat_name, expParser.model_file, exp_pred)
    exp_res = [LABEL_SENSES_MAP[l.strip().split()[-1]] for l in open(exp_pred, 'r')]
    rid = 0
    for art in articles:
        for rel in art.exp_relations:
            pred_sense = exp_res[rid]
            rel.sense = [pred_sense]
            rid += 1
def read_data(e_path, f_path):
    """
    Combine two halves of a parallel corpus into one.
    :param e_path: path to language 1 file.
    :param f_path: path to language 2 file.
    :return: a Corpus whose `corpus` attribute holds tuples of parallel sentences;
             each sentence is built from a list of tokens.
    """
    fe = open(e_path, "r").read()
    ff = open(f_path, "r").read()
    corpus = Corpus()
    for e, f in zip(fe.split(" \n"), ff.split(" \n")):
        e = e.split(' ')
        f = f.split(' ')
        corpus.add_foreign(set(f))
        corpus.corpus.append((Sentence(e, pad=True), Sentence(f)))
    return corpus
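# Usage sketch (assumed, not from the original source): the paths below are
# hypothetical; each file is expected to hold one half of the parallel corpus with
# one sentence per line and space-separated tokens, as the splitting logic above assumes.
parallel_corpus = read_data('data/training.en', 'data/training.fr')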
def main():
    data = Corpus()
    dotted_path = data.data_directory
    ends = "ji.json"  # controls which file suffixes are processed; available suffixes: ji.json, corpus.json
    files = data.list_corpus_files(dotted_path, ends)
    for f in files:
        print(f)
    corpus = data.load_corpus(dotted_path, ends)
    pattern_answer = []
    for topic in corpus:
        for dialog in topic:
            if len(dialog) == 2:
                pattern = sent_2_pattern(dialog[0])
                pattern_answer.append([pattern, dialog[1]])
    topic_script = generate_script(pattern_answer)
    save_2_file(topic_script, topic_profile)
class TestCorpusSet(unittest.TestCase):
    def setUp(self):
        self.negative = StringIO(u'I hated that so much')
        self.negative_corpus = Corpus(self.negative, 'negative')
        self.positive = StringIO(u'loved movie!! loved')
        self.positive_corpus = Corpus(self.positive, 'positive')

    def test_trivial(self):
        """consumes multiple files and turns them into sparse vectors"""
        self.assertEqual('negative', self.negative_corpus.sentiment)

    def test_tokenize1(self):
        """downcases all the word tokens"""
        self.assertListEqual(['quick', 'brown', 'fox'], Corpus.tokenize('Quick Brown Fox'))

    def test_tokenize2(self):
        """ignores all stop symbols"""
        self.assertListEqual(['hello'], Corpus.tokenize('"\'hello!?!?!.\'" '))

    def test_tokenize3(self):
        """ignores the unicode space"""
        self.assertListEqual(['hello', 'bob'], Corpus.tokenize(u'hello\u00A0bob'))

    def test_positive(self):
        """consumes a positive training set"""
        self.assertEqual('positive', self.positive_corpus.sentiment)

    def test_words(self):
        """consumes a positive training set and a unique set of words"""
        self.assertEqual({'loved', 'movie'}, self.positive_corpus.get_words())

    def test_sentiment_code_1(self):
        """defines a sentiment_code of 1 for positive"""
        self.assertEqual(1, Corpus(StringIO(u''), 'positive').sentiment_code)

    def test_sentiment_code_minus1(self):
        """defines a sentiment_code of -1 for negative"""
        self.assertEqual(-1, Corpus(StringIO(u''), 'negative').sentiment_code)
def test_get_relevant_docs(self):
    clus_terms = {  # {cluster-label: {A set of term-ids}}
        0: {2, 4},
        1: {1, 3}
    }
    actual_output = Cp.get_relevant_docs(clus_terms, df)
    desired_output = {  # {cluster-label: {A set of doc-ids}}
        0: {6, 7, 9},
        1: {4, 8, 9}
    }
    self.assertEqual(actual_output, desired_output)
def run(config, output_dir, num_rep=5, valid_split=0.2, patience=0):
    use_cuda = torch.cuda.is_available()
    mean = 0.0
    vocab_file = 'data/twitter_hashtag/1kthashtag.vocab'
    dataset_file = 'data/twitter_hashtag/multiple.txt'
    emb = load_glove_embedding('data/twitter_hashtag/1kthashtag.glove')
    criterion = nn.CrossEntropyLoss()
    corpus = TwitterHashtagCorpus(train_file=dataset_file, vocab_file=vocab_file)
    config.vocab_size = corpus.vocab_size
    train_corpus = Corpus()
    train_corpus.x_data = corpus.x_train[:1000]
    train_corpus.y_data = corpus.y_train[:1000]
    valid_corpus = Corpus()
    valid_corpus.x_data = corpus.x_validation[:1000]
    valid_corpus.y_data = corpus.y_validation[:1000]
    metrics = {'accuracy': skmetrics.accuracy_score}
    for rep in range(1, num_rep + 1):
        model = TextCNN(config=config, pre_trained_emb=emb)
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
        # train_corpus, valid_corpus = corpus.split(valid_split=valid_split)
        output_dir_rep = os.path.join(output_dir, "rep{}".format(rep))
        t = Trainer(train_corpus=train_corpus, valid_corpus=valid_corpus, test_corpus=None,
                    model=model, config=config, criterion=criterion, optimizer=optimizer,
                    verbose=False, output_dir=output_dir_rep, train_metrics=metrics,
                    val_metrics=metrics, selection_metric='accuracy', use_cuda=use_cuda)
        res = t.train(tqdm_prefix="Rep {}/{}".format(rep, num_rep), patience=patience,
                      init_res_dict={"rep": rep})
        pprint(res["best"])
        mean = mean + res['best']['selection_metric']
    mean = mean / num_rep
    print(mean)
def main():
    args = get_args()
    corpus = Corpus("text8", args.gram_min, args.gram_max, args.part == "part")
    subword_embeddings, _ = train_fasttext(corpus, ns_num=args.ns, window_size=5, dimension=100,
                                           learning_rate=0.01, epoch=1, subsampling=True)
    find_similar_words(corpus, subword_embeddings, args.gram_min, args.gram_max)
def prepare_data(self, parse_path, rel_path, which, to_file):
    count = 0
    processed = []
    rel_dict = Corpus.read_relations(rel_path)
    for art in Corpus.read_parses(parse_path, rel_dict):
        for rel in art.relations:
            if rel.rel_type != 'Explicit':
                continue
            rel.article = art
            rel.get_conn_leaves()
            rel.get_arg_leaves()
            # add a filter function (2015/9/29)
            if which == 'train' and not self.need_extract(rel):
                continue
            count += 1
            processed.append(self.print_features(rel, which, to_file))
    print >> logs, "processed %d instances" % count
    return processed