def __iter__(self):
    for filename in self.file_list[0:1]:  # currently restricted to the first annotation file
        sent_file = os.path.join(self.annotation_dir, filename)
        with open(sent_file) as file:
            lc = LoopTimer(update_after=100)
            abs_list = []
            lastid = None
            for line in file:
                if self.print_status:
                    lc.update("Lemma Doc Stream")
                data = json.loads(line)
                doc_id = data['id']
                xml = data['annotation']
                if lastid != doc_id and len(abs_list) > 0:
                    # A new document starts: yield the tokens collected so far.
                    yield lastid, abs_list
                    abs_list = []
                # Must run on every line, or the first document would be yielded with id None.
                lastid = doc_id
                token_list = mf.xml2lemmas(xml)
                pos_list = mf.xml2pos(xml)
                for tokens, pos_tags in zip(token_list, pos_list):
                    token_cleaned, pos_cleaned = utils.posFilterString(tokens, pos_tags)
                    if len(token_cleaned) > 0:
                        abs_list.extend(token_cleaned)
            if len(abs_list) > 0:
                # Yield the final document of the file.
                yield lastid, abs_list

def __iter__(self):
    lc = LoopTimer(update_after=self.print_settings["update_after"],
                   avg_length=self.print_settings['avg_length'],
                   target=self.limit)
    for abstract_id in self.abstracts:
        path_to_annotation_file = os.path.join(self.path_to_annotations, abstract_id + ".antn")
        if not os.path.isfile(path_to_annotation_file):
            print()
            print(abstract_id + " in db but missing file.")
            print()
            continue
        with open(path_to_annotation_file, "rb") as annotation_file:
            annotation = pickle.load(annotation_file)
        document = nlp_to_sent_token(annotation,
                                     token_type=self.token_type,
                                     clean=self.token_cleaned,
                                     lower=self.lower,
                                     bigrams=self.bigram,
                                     dictionary=self.dictionary)
        for sentence_id, sentence in enumerate(document):
            if self.print_status:
                lc.update("Yield Sentence")
            if self.output is None:
                yield sentence
            elif self.output == 'all':
                yield abstract_id, sentence_id, sentence

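# Usage sketch for the sentence stream above. The owning class name
# ("SentenceStream") and its constructor arguments are assumptions for
# illustration; only the __iter__ body is from this file. Because __iter__
# is a generator method, the object can be iterated repeatedly, which is
# what streaming corpus consumers such as gensim expect.
#
# stream = SentenceStream(abstracts=abstract_ids, output='all')
# for abstract_id, sentence_id, tokens in stream:
#     process(tokens)
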
def __iter__(self):
    for filename in self.file_list[0:1]:  # currently restricted to the first annotation file
        sent_file = os.path.join(self.annotation_dir, filename)
        with open(sent_file) as file:
            lc = LoopTimer(update_after=100)
            for line in file:
                if self.print_status:
                    lc.update("Lemma Para Stream")
                data = json.loads(line)
                doc_id = data['id']
                para_id = data['paragraphID']
                xml = data['annotation']
                token_list = mf.xml2lemmas(xml)
                pos_list = mf.xml2pos(xml)
                para_list = []
                for tokens, pos_tags in zip(token_list, pos_list):
                    token_cleaned, pos_cleaned = utils.posFilterString(tokens, pos_tags)
                    if len(token_cleaned) > 0:
                        para_list.extend(token_cleaned)
                yield doc_id, para_id, para_list

def __iter__(self):
    for filename in self.file_list[0:1]:  # currently restricted to the first annotation file
        sent_file = os.path.join(self.annotation_dir, filename)
        with open(sent_file) as file:
            lc = LoopTimer(update_after=100)
            lastid = None
            for line in file:
                if self.print_status:
                    lc.update("Posbigram Sent Stream")
                data = json.loads(line)
                xml = data['annotation']
                doc_id = data['id']  # renamed from `id`, which shadows the builtin
                if lastid != doc_id:
                    para_num = 0
                else:
                    para_num += 1
                lastid = doc_id
                token_list = mf.xml2words(xml)
                pos_list = mf.xml2pos(xml)
                for tokens, pos_tags in zip(token_list, pos_list):
                    token_cleaned, pos_cleaned = utils.posFilterString(tokens, pos_tags)
                    if len(token_cleaned) > 0:
                        yield doc_id, para_num, utils.makeBigrams(pos_cleaned)

def findphrasesbyrules(rf_rules):
    phrases_ = []
    lc = LoopTimer(update_after=5000, avg_length=10000, target=len(dep_tree_dict))
    for (abstract_id, sentence_id), dep_tree in dep_tree_dict.items():
        for rf_rule in rf_rules:
            phrases_.extend(pm.get_phrases(dep_tree, rf_rule))
        lc.update("Find Phrases By Rules")
    return phrases_

def rf_label_ssorc_paragraphs():
    dirname = os.path.dirname(__file__)
    json_dir = os.path.join(dirname, '../../data/processed/ssorc/json')
    file_list = sorted([f for f in os.listdir(json_dir)
                        if os.path.isfile(os.path.join(json_dir, f)) and f.endswith('.para')])

    target_file_path = os.path.join(dirname, '../../data/processed/ssorc/rf_targets/targets.json')
    if os.path.isfile(target_file_path):
        os.remove(target_file_path)

    # Map each rhetorical-function label to its list of paragraph-title candidates,
    # including the variant with a trailing colon.
    labels = dict()
    with open(os.path.join(dirname, '../../data/definitions/rf_labels.txt')) as rfdef:
        for line in rfdef:
            linesplit = line.split('\t')
            label = linesplit[0]
            candidates = linesplit[1].rstrip().split(',')
            labels[label] = []
            for candidate in candidates:
                if len(candidate) > 0:
                    candidate = candidate.lower()
                    labels[label].append(candidate)
                    labels[label].append(candidate + ":")

    with open(target_file_path, 'a') as target_file:
        for filename in file_list:
            cur_path = os.path.join(json_dir, filename)
            lt = LoopTimer()
            with open(cur_path) as file:
                for file_line in file:
                    data = json.loads(file_line)
                    title = data['paragraphTitle'].lower()
                    rf_label = None
                    for key in labels:
                        if title in labels[key]:
                            rf_label = key
                            break
                    if rf_label is not None:
                        target_data = {'id': data['id'],
                                       'paragraphID': data['paragraphID'],
                                       'rflabel': rf_label}
                        target_file.write(json.dumps(target_data) + '\n')
                    lt.update("RF Labeling")

def __iter__(self):
    for filename in self.file_list[0:1]:  # currently restricted to the first annotation file
        sent_file = os.path.join(self.annotation_dir, filename)
        with open(sent_file) as file:
            lc = LoopTimer(update_after=100)
            for line in file:
                if self.print_status:
                    lc.update("XML Para Stream")
                data = json.loads(line)
                xml = data['annotation']
                doc_id = data['id']
                para_num = data['paragraphID']
                if xml.startswith('<?xml'):
                    yield doc_id, para_num, xml

def build_scipy_feature_file(dtype, prefix='', num_samples=10000):
    dirname = os.path.dirname(__file__)
    dictionary_dir = os.path.join(dirname, '../../data/processed/' + dtype + '/dictionaries')
    tfidf_dir = os.path.join(dirname, '../../data/processed/' + dtype + '/tfidf')
    feature_file = os.path.join(dirname, '../../data/processed/' + dtype + '/features/tm_features_' + prefix + '.npz')

    if os.path.isfile(feature_file):
        os.remove(feature_file)

    lemma_dic = gensim.corpora.Dictionary.load(os.path.join(dictionary_dir, 'lemma.dic'))
    lemma_tfidf = gensim.models.TfidfModel.load(os.path.join(tfidf_dir, 'lemma_model.tfidf'))
    lemma_corpus = corpora.lemma_doc_stream(dtype)

    lt = LoopTimer()
    row = []
    col = []
    data = []
    for idx, document in enumerate(lemma_corpus):
        if num_samples != -1 and idx == num_samples:
            break
        # The stream yields (doc_id, lemmas) pairs; doc2bow expects the token list.
        lemma_bow = lemma_dic.doc2bow(document[1])
        vec_lemma_tfidf = lemma_tfidf[lemma_bow]
        for term_id, weight in vec_lemma_tfidf:
            row.append(idx)
            col.append(term_id)
            data.append(weight)
        lt.update("Build Features")

    m = idx + 1
    n = len(lemma_dic)
    feature_vector = scipy.sparse.csc_matrix((np.array(data), (np.array(row), np.array(col))),
                                             shape=(m, n))
    scipy.sparse.save_npz(feature_file, feature_vector)

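# Round-trip sketch: reload the matrix written by build_scipy_feature_file
# and inspect it. scipy.sparse.load_npz is standard API; dtype='ssorc' and
# an empty prefix are illustrative values, not fixed by this file.
import scipy.sparse

features = scipy.sparse.load_npz('../../data/processed/ssorc/features/tm_features_.npz')
print(features.shape)  # (number of documents, vocabulary size)
print(features.nnz)    # number of non-zero tf-idf entries
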
def __iter__(self):
    lc = LoopTimer(update_after=self.print_settings["update_after"],
                   avg_length=self.print_settings['avg_length'],
                   target=self.limit)
    for abstract_id in self.abstracts:
        path_to_annotation_file = os.path.join(self.path_to_annotations, abstract_id + ".antn")
        if not os.path.isfile(path_to_annotation_file):
            print()
            print(abstract_id + " in db but missing file.")
            print()
            continue
        with open(path_to_annotation_file, "rb") as annotation_file:
            annotation = pickle.load(annotation_file)
        if self.print_status:
            lc.update("Yield Abstract")
        if self.output is None:
            yield annotation
        elif self.output == 'all':
            yield abstract_id, annotation

def make_dictionaries(dtype):
    dirname = os.path.dirname(__file__)
    inter_dir = os.path.join(dirname, '../../data/interim', dtype)

    word_dic = gensim.corpora.Dictionary()
    pos_dic = gensim.corpora.Dictionary()
    lemma_dic = gensim.corpora.Dictionary()
    wordbi_dic = gensim.corpora.Dictionary()
    posbi_dic = gensim.corpora.Dictionary()

    word_corpus = corpora.word_doc_stream(dtype)
    wordbigram_corpus = corpora.wordbigram_doc_stream(dtype)
    pos_corpus = corpora.pos_doc_stream(dtype)
    posbigram_corpus = corpora.posbigram_doc_stream(dtype)
    lemma_corpus = corpora.lemma_doc_stream(dtype)

    lt = LoopTimer()
    # Each stream yields (doc_id, tokens); index 1 holds the token list.
    for word_doc, wordbigram_doc, pos_doc, posbigram_doc, lemma_doc in zip(
            word_corpus, wordbigram_corpus, pos_corpus, posbigram_corpus, lemma_corpus):
        lemma_dic.add_documents([lemma_doc[1]], prune_at=20000000)
        word_dic.add_documents([word_doc[1]], prune_at=20000000)
        wordbi_dic.add_documents([wordbigram_doc[1]], prune_at=20000000)
        pos_dic.add_documents([pos_doc[1]], prune_at=20000000)
        posbi_dic.add_documents([posbigram_doc[1]], prune_at=20000000)
        lt.update("Build Dictionaries")

    lemma_dic.save(os.path.join(inter_dir, 'full_lemma.dict'))
    wordbi_dic.save(os.path.join(inter_dir, 'full_wordbi.dict'))
    word_dic.save(os.path.join(inter_dir, 'full_word.dict'))
    posbi_dic.save(os.path.join(inter_dir, 'full_posbi.dict'))
    pos_dic.save(os.path.join(inter_dir, 'full_pos.dict'))

    print(word_dic)
    print(wordbi_dic)
    print(pos_dic)
    print(posbi_dic)
    print(lemma_dic)

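# Usage sketch: reload one of the dictionaries saved above and convert a
# token list to bag-of-words. Dictionary.load and doc2bow are standard
# gensim API; dtype='ssorc' and the example tokens are illustrative.
import gensim

lemma_dic = gensim.corpora.Dictionary.load('../../data/interim/ssorc/full_lemma.dict')
bow = lemma_dic.doc2bow(['model', 'training', 'data'])
print(bow)  # [(term_id, count), ...] for tokens known to the dictionary
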
def make_ssorc_data():
    dirname = os.path.dirname(__file__)
    raw_dir = os.path.join(dirname, '../../data/raw/ssorc')
    file_list = sorted([f for f in os.listdir(raw_dir)
                        if os.path.isfile(os.path.join(raw_dir, f))])
    json_dir = os.path.join(dirname, '../../data/processed/ssorc/json')

    for filename in file_list[1:2]:  # currently restricted to the second raw file
        cur_path = os.path.join(raw_dir, filename)
        json_file = os.path.join(json_dir, filename + '.json')
        if os.path.isfile(json_file):
            os.remove(json_file)
        print(filename)

        lt = LoopTimer()
        with open(json_file, 'a') as wfile, open(cur_path) as file:
            for file_line in file:
                data = json.loads(file_line)
                # Keep only English abstracts with a year, a DOI, and more than 50 words.
                if ('year' in data) and ('paperAbstract' in data) and ('doi' in data):
                    if (data['year'] != '') and (len(data['paperAbstract'].split()) > 50) and (data['doi'] != ''):
                        if check_string_for_english(data['paperAbstract']):
                            new_data = {'year': data['year'],
                                        'paperAbstract': data['paperAbstract'],
                                        'id': data['doi']}
                            wfile.write(json.dumps(new_data) + '\n')
                lt.update("Make Data")

def make_dblp_data():
    dirname = os.path.dirname(__file__)
    raw_dir = os.path.join(dirname, '../../data/raw/dblp')
    data_abstract_file = os.path.join(raw_dir, 'data_abstracts.tsv')
    data_information_file = os.path.join(raw_dir, 'data_information.tsv')
    json_dir = os.path.join(dirname, '../../data/processed/dblp/json')
    json_file = os.path.join(json_dir, 'dblp.json')

    if os.path.isfile(json_file):
        os.remove(json_file)

    with open(data_information_file) as infofile, \
            open(data_abstract_file) as abstractfile, \
            open(json_file, 'a') as jfile:
        lt = LoopTimer()
        count = 0
        for infoline, abstractline in zip(infofile, abstractfile):
            infodata = infoline.split('\t')
            abstractdata = abstractline.split('\t')

            infoID = infodata[0]          # unused
            infoDOI = infodata[1]
            infoTitle = infodata[2]       # unused
            infoAuthors = infodata[3]     # unused
            infoYear = int(infodata[4])

            abstractID = abstractdata[0]  # unused
            abstractContent = abstractdata[1]

            if check_string_for_english(abstractContent):
                new_data = {'year': infoYear,
                            'paperAbstract': abstractContent,
                            'id': infoDOI}
                jfile.write(json.dumps(new_data) + '\n')
                count += 1
                lt.update(str(count) + " Abstracts added")

def make_deptree(mod_name, dep_tree_dict, dictionary, dep_type='basicDependencies', limit=2000):
    # Alternative dependency styles:
    # dep_type = 'enhancedDependencies'
    # dep_type = 'enhancedPlusPlusDependencies'
    connection = mysql.connector.connect(host="localhost",
                                         user="******",
                                         passwd="thesis")
    cursor = connection.cursor()
    cursor.execute("USE ssorc;")
    sq1 = f"SELECT abstract_id FROM abstracts_ml WHERE entities LIKE '%machine learning%' AND annotated=1 LIMIT {limit}"
    cursor.execute(sq1)

    print("Collecting Abstracts")
    abstracts = set()
    for row in cursor:
        abstracts.add(row[0])
    connection.close()

    print(f"{len(abstracts)} to build.")
    size = len(abstracts)

    annotations = AnnotationStream(abstracts=abstracts, output='all')
    lc = LoopTimer(update_after=10, avg_length=200, target=size)
    for abstract_id, annotation in annotations:
        for sentence in annotation['sentences']:
            dep_tree = sentence2tree(sentence, dictionary=dictionary, dep_type_=dep_type)
            sentence_id = int(sentence['index'])
            if dep_tree is not None:
                dep_tree_dict[(abstract_id, sentence_id)] = dep_tree
        lc.update("Build Dep Tree Dict")
    print()
    print(f"Size of Dictionary: {len(dictionary)}")

for idx, document in enumerate(corpus):
    # Each document is a (abstract_id, tokens) pair.
    abstract_id = document[0]
    words = document[1]
    label = abstract_labels[abstract_id]
    labels.append(label)

    bow = dictionary.doc2bow(words)
    vec_tfidf = tfidf[bow]
    for term_id, weight in vec_tfidf:
        row.append(idx)
        col.append(term_id)
        data.append(weight)
    lt.update("Build Features")

m = idx + 1
n = len(dictionary)

row = np.array(row)
col = np.array(col)
data = np.array(data)
labels = np.array(labels)

feature_vector = scipy.sparse.csc_matrix((data, (row, col)), shape=(m, n))
print(feature_vector.shape)
print(labels.shape)

scipy.sparse.save_npz(path_to_feature_file, feature_vector)

filerange = [0, 39]
filerange[1] = min(filerange[1], 39)
blastfile = 1 if filerange[1] == 39 else 0  # the last raw file holds only 219709 lines
target = min(39, (filerange[1] - filerange[0])) * 1000000 + blastfile * 219709

key_error = 0
mass_error = 0
prune_error = 0

lt = LoopTimer(update_after=500, avg_length=1000000, target=target)
for filename in file_list[filerange[0]:filerange[1]]:
    cur_path = os.path.join(raw_dir, filename)
    with open(cur_path) as file:
        for idx, file_line in enumerate(file):
            update_string = f"Prep - Count:{count} | key: {key_error} - different: {mass_error} - One Char: {prune_error}"
            break_p = lt.update(update_string)
            data = json.loads(file_line)

            if not all(key in data for key in req_keys):
                key_error += 1
                continue

            title = data['title']
            abstract = data['paperAbstract']
            abstract_id = data['id']
            year = data['year']
            authors = data['authors']
            inCitations = data['inCitations']
            outCitations = data['outCitations']
            entities = data['entities']
            journal = data['journalName'].lower()

print(filename)
lt = LoopTimer(update_after=50000, avg_length=50000, target=len(dict_frame))
for abstract_id in dict_frame:
    entities = dict_frame[abstract_id]
    if 'machine learning' in entities:
        ml_words.update(set(entities))
        ml_abstracts[abstract_id] = entities
    if 'artificial intelligence' in entities:
        ai_words.update(set(entities))
        ai_abstracts[abstract_id] = entities
    lt.update("Parse Dict")
print()
count += 1
filename = f"all_ent_{count}.dict"

print(f"Num ml words {len(ml_words)}")
print(f"Num ai words {len(ai_words)}")
print(f"Num ml abstracts {len(ml_abstracts)}")
print(f"Num ai abstracts {len(ai_abstracts)}")

with open(os.path.join(panda_path, "ml_entities.pickle"), "wb") as ml_file:
    pickle.dump(ml_words, ml_file)
with open(os.path.join(panda_path, "ai_entities.pickle"), "wb") as ai_file:
    pickle.dump(ai_words, ai_file)

cursor = connection.cursor() cursor.execute("USE ssorc;") with open(path_to_feature_file, "rb") as feature_file: documents = pickle.load(feature_file) pos = 0 neg = 0 lc = LoopTimer(update_after=50, avg_length=700, target=len(documents)) for abstract_id in documents: features = documents[abstract_id] prediction = mllr.predict_proba(features) if prediction[0][1] > 0.8: label = 1 pos += 1 else: label = -1 neg += 1 sql = f'UPDATE abstracts SET isML = {label} WHERE abstract_id = "{abstract_id}"' cursor.execute(sql) lc.update("Classify") print() print(f"POS: {pos}") print(f"NEG: {neg}") connection.commit() connection.close()
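# Safer variant of the UPDATE above (sketch): mysql.connector supports
# parameterized queries, which avoids quoting problems if an abstract_id
# ever contains quote characters. Semantics are otherwise unchanged.
#
# sql = "UPDATE abstracts SET isML = %s WHERE abstract_id = %s"
# cursor.execute(sql, (label, abstract_id))
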
os.mkdir(path_to_mlgenome_features)

with open(os.path.join(path_to_mlgenome, "unique_mentions.pickle"), "rb") as handle:
    mentions = pickle.load(handle)

feature_vector = []
lt_target = len(mentions)
lt = LoopTimer(update_after=5000, avg_length=10000, target=lt_target)
for mention in mentions:
    m_string = mention["string"]
    m_is_acronym = mention['is_acronym']
    m_vec = mention['m_vec']

    # if m_is_acronym:
    #     continue

    feature_vector.append(m_vec)
    breaker = lt.update(f"Make Training-Set - {len(feature_vector)}")

print(len(feature_vector))

feature_dict = dict()
feature_dict["features"] = feature_vector

with open(os.path.join(path_to_mlgenome_features, "knn_features.pickle"), "wb") as handle:
    pickle.dump(feature_dict, handle)

        journal_dict[journal] += 1

    if venue in venues:
        if venue not in venue_dict:
            venue_dict[venue] = 0
        venue_dict[venue] += 1

    if venue in venues or journal in journals:
        total_count += 1
        if "machine learning" in entities:
            ml_count += 1
        if "artificial intelligence" in entities:
            ai_count += 1

    breaker = lt.update(f"Analyze ({total_count})")

sorted_venue = sorted(venue_dict.items(), key=operator.itemgetter(1), reverse=True)
sorted_journals = sorted(journal_dict.items(), key=operator.itemgetter(1), reverse=True)

print("\n\n")
print("Venues")
for item in sorted_venue[:20]:
    print(f"{item[0]}: {item[1]}")
print()
print("------------------------")
print("Journals")
for item in sorted_journals[:20]:
    print(f"{item[0]}: {item[1]}")

lemma_list = list()
coarse_pos_list = list()
fine_pos_list = list()
ent_type_list = list()
merged_word_list = list()
merged_ent_type_list = list()

targ = len(infoDF)
not_found = 0
lt = LoopTimer(update_after=100, avg_length=10000, target=targ)
for abstract_id, row in infoDF.iterrows():
    file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy")
    if not os.path.isfile(file_path):
        not_found += 1
        lt.update(f"Create Pandas - {len(abstract_id_list)}")
        continue
    doc = Doc(vocab).from_disk(file_path)

    abstract_id_list.append(abstract_id)
    # Sentences are joined by double tabs, tokens within a sentence by single tabs.
    word_list.append("\t\t".join(["\t".join([token.text for token in sentence if token_conditions(token)])
                                  for sentence in doc.sents]))
    lemma_list.append("\t\t".join(["\t".join([token.lemma_ for token in sentence if token_conditions(token)])
                                   for sentence in doc.sents]))

connection = mysql.connector.connect(host="localhost",
                                     user="******",
                                     passwd="thesis")
cursor = connection.cursor()
cursor.execute("USE ssorc;")
cursor.execute("SELECT abstract_id FROM abstracts WHERE annotated=0 and year>1990")

lc = LoopTimer(update_after=1000)
abstracts_to_process = set()
for row in cursor:
    abstracts_to_process.add(row[0])
    lc.update("Collect Abstracts to Process")
connection.close()

print()
print("There are " + str(len(abstracts_to_process)) + " files to process")

annotators = 'tokenize,ssplit,pos,lemma,depparse'
splitter_annotators = 'ssplit'

nlp = StanfordCoreNLP('../../../stanford-corenlp-full-2018-02-27')
props = {'annotators': annotators, 'pipelineLanguage': 'en', 'outputFormat': 'json'}
split_props = {'annotators': splitter_annotators, 'pipelineLanguage': 'en', 'outputFormat': 'json'}

lc = LoopTimer(update_after=1, avg_length=500)
for abstract_id in abstracts_to_process:
    lc.update("Annotate Abstract " + abstract_id)

            sent_info['sent_id'] = sent_id
            sent_infos.append(sent_info)

            print_string = ""
            label_count[label_dic[label_key]] += 1
            breaker = 0
            for lco in label_count:
                counting = label_count[lco]
                if counting >= label_limit:
                    breaker += 1
                print_string += f"{lco}: {counting} | "
            # Fraction of labels that have reached the per-label limit;
            # stop collecting once every label is saturated.
            breaker = breaker / len(label_count)
            # print_string = print_string[:len(print_string) - 3]
            print_string += f"Breaker: {breaker}"
            lc.update(f"Build AP Features | {print_string}")
            if breaker >= 1:
                break
        if breaker >= 1:
            break

for feature_data in sent_infos:
    did = feature_data['id']
    sid = feature_data['sent_id']
    label_key = (did, sid)
    target = label_dic[label_key]
    target_vector.append(target)

    # model = svm.SVC(decision_function_shape='ovo',
    #                 C=c_para,
    #                 kernel='rbf',
    #                 gamma='auto')
    model = svm.SVC(kernel="linear", C=c_para, decision_function_shape='ovo')
    scores = cross_val_score(model, learning_features, learning_targets, cv=10, n_jobs=-1)
    mean_score = scores.mean()
    if mean_score > best_score:
        best_score = mean_score
        best_para = c_para
    score_list.append((c_para, mean_score))
    lc.update(f"Best Para: {best_para} - Best Score: {best_score}")

print()
print(score_list)
print()
print("Best Reg-Para: " + str(best_para))
print()

# best_model = svm.SVC(decision_function_shape='ovo',
#                      C=best_para,
#                      kernel='rbf',
#                      gamma='auto')
best_model = svm.SVC(kernel="linear", C=best_para, decision_function_shape='ovo')
best_model.fit(learning_features, learning_targets)

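# Equivalent search using sklearn's GridSearchCV (sketch): same linear SVC
# and 10-fold CV as above. The C grid values here are illustrative, and
# learning_features / learning_targets are the arrays already in scope.
from sklearn import svm
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(svm.SVC(kernel="linear", decision_function_shape='ovo'),
                    param_grid={"C": [0.01, 0.1, 1.0, 10.0, 100.0]},
                    cv=10,
                    n_jobs=-1)
grid.fit(learning_features, learning_targets)
print(grid.best_params_, grid.best_score_)
best_model = grid.best_estimator_  # refit on the full data by default
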
    if all(key in data for key in req_keys):
        entities = [entity.lower() for entity in data['entities']]
        journal = data['journalName'].lower()
        venue = data['venue'].lower()

        if journal not in journal_dict:
            journal_dict[journal] = 0
        journal_dict[journal] += 1

        if venue not in venue_dict:
            venue_dict[venue] = 0
        venue_dict[venue] += 1

    breaker = lt.update("Make Data")

sorted_venue = sorted(venue_dict.items(), key=operator.itemgetter(1), reverse=True)
sorted_journals = sorted(journal_dict.items(), key=operator.itemgetter(1), reverse=True)

print("\n\n")
print("Venues")
for item in sorted_venue[:20]:
    print(f"{item[0]}: {item[1]}")
print()
print("------------------------")
print("Journals")
for item in sorted_journals[:20]:
    print(f"{item[0]}: {item[1]}")

import os
import pickle

import mysql.connector

from src.utils.LoopTimer import LoopTimer

path_to_ner = "/media/norpheo/mySQL/db/ssorc/NER"
path_to_ml_algo_abstract_save = os.path.join(path_to_ner, "ml_algo_abstract_new.pickle")

with open(path_to_ml_algo_abstract_save, "rb") as algo_abstract_file:
    ml_algo_abstract = pickle.load(algo_abstract_file)

connection = mysql.connector.connect(host="localhost",
                                     user="******",
                                     passwd="thesis")
cursor = connection.cursor()
cursor.execute("USE ssorc;")

lc = LoopTimer(update_after=100, avg_length=10000)
for abstract_id in ml_algo_abstract:
    sq1 = f'INSERT INTO mlabstracts (abstract_id) VALUES("{abstract_id}") ON DUPLICATE KEY UPDATE abstract_id = ("{abstract_id}")'
    cursor.execute(sq1)
    lc.update("Insert Into")

connection.commit()
connection.close()

for line in label_file:
    info = line.replace('\n', '').split('\t')
    if len(info) != 4:
        print(len(info))
        print(line)
        continue
    abstract_id = info[0]
    sent_id = info[1]
    label = info[3]

    # Label '0' marks unlabeled sentences; skip them.
    if label == '0':
        continue

    if label not in label_set:
        label_set[label] = 0
    label_set[label] += 1

    label_key = (abstract_id, int(sent_id))
    targets[label_key] = label
    lc.update("Make Targets")

print()
print(f"labels: {label_set}")
print(f"Size: {len(targets)}")

with open(target_path, 'wb') as target_file:
    pickle.dump(targets, target_file, protocol=pickle.HIGHEST_PROTOCOL)

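# Round-trip sketch: reload the targets written above. Keys are
# (abstract_id, sentence_id) tuples, values are the string labels.
import pickle

with open(target_path, 'rb') as target_file:
    targets = pickle.load(target_file)
print(len(targets))
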
for candidate in mentions:
    c_string = candidate["mention_string"]
    c_tokens = candidate["mention_tokens"]
    c_dvec = candidate["doc_vector"]
    c_svec = candidate["sentence_vector"]
    c_mvec = candidate["mention_vector"]

    # Pairwise string and vector similarities between mention m and candidate c.
    similarity = mlares.similarity_ratio(m_string, c_string)
    intersection = mlares.intersection_of_words(m_tokens, c_tokens)
    is_pfix = mlares.is_prefix(m_tokens, c_tokens)
    is_ifix = mlares.is_infix(m_tokens, c_tokens)
    is_sfix = mlares.is_suffix(m_tokens, c_tokens)
    dvec_sim = mlares.vec_sim(m_dvec, c_dvec)
    svec_sim = mlares.vec_sim(m_svec, c_svec)
    mvec_sim = mlares.vec_sim(m_mvec, c_mvec)

    feat_vec = [similarity, intersection,
                is_pfix, is_ifix, is_sfix,
                dvec_sim, svec_sim, mvec_sim]

    entities = (m_string, c_string)
    entity_linking[entities] = mlares_clf.predict([feat_vec])
    lt.update(f"Make Training-Set - {len(entity_linking)}")

with open(os.path.join(path_to_mlgenome, "entity_linking.pickle"), "wb") as el_file:
    pickle.dump(entity_linking, el_file)

        ent_counter[entity_name] = {"list": en_idx_list, "counter": en_count}
        entity = (start_idx, end_idx, "MLALGO")
        ent_list.append(entity)
        # print(f"Add: {entity_name}")
        # pos_string = " ".join([sentence[tid].tag_ for tid in range(start_id, end_id + 1)])
        # print(f'     {pos_string}')

    if len(ent_list) > 0:
        entities = {"entities": ent_list}
        TRAIN_DATA.append((sentence.text, entities))
    breaker = lt.update(f"Make TD - {len(TRAIN_DATA)}")
    # if breaker > 100000:
    #     break

print()
# Invalidate entity annotations whose surface form occurs at most `threshold` times.
for entity_name in ent_counter:
    en_count = ent_counter[entity_name]["counter"]
    en_list = ent_counter[entity_name]["list"]
    if en_count <= threshold:
        for entry in en_list:
            train_id = entry["sid"]
            ent_id = entry["entid"]
            TRAIN_DATA[train_id][1]["entities"][ent_id] = None
    else:

    topic_dist = tm_model.get_topic_dist(tokens)
    top_n_topics = topic_dist.argsort()[::-1][:5]

    if year not in year_count:
        year_count[year] = 0
        for topic in track_topics:
            timeseries[topic][year] = 0
    year_count[year] += 1

    for topic in track_topics:
        for topic_n in track_topics[topic]:
            if topic_n in top_n_topics:
                timeseries[topic][year] += topic_dist[topic_n]
    breaker = lc.update("Model Topics")

print()
for topic in track_topics:
    years_list = [key for key in year_count.keys() if key < 2018]
    years_list.sort()
    x = np.array(years_list)
    # Normalize by the number of abstracts per year.
    y = np.array([timeseries[topic][year] / year_count[year] for year in years_list])

    fig, ax = plt.subplots()
    ax.plot(x, y)
    fig.suptitle(f"{topic} - LDA", fontsize=10, y=1.00)
    start, end = ax.get_xlim()
    start = int(start)
    end = int(end)

        n_cat = cat_assignment.count(category)
        if n_cat > top_cat_count:
            top_cat_count = n_cat
            top_cat = category

    # Feature vector: sentence embedding plus the sentence's relative position in the abstract.
    location = s_id / n_sents
    word_vector = sentence.vector
    features = np.append(word_vector, [location])

    target_vector.append(top_cat)
    feature_vector.append(features)

    info_count = [(cat, target_vector.count(cat)) for cat in set(target_vector)]
    breaker = lc.update(f"Make Features - {info_count}")

feature_vector = np.array(feature_vector)
target_vector = np.array(target_vector)

print(feature_vector.shape)
print(target_vector.shape)

feature_dict = dict()
feature_dict["features"] = feature_vector
feature_dict["targets"] = target_vector
feature_dict["settings"] = settings

with open(os.path.join(path_to_rfl, f"{feature_info_name}.pickle"), "wb") as handle:
    pickle.dump(feature_dict, handle)