Example #1
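    # Streams the first annotation file as (doc_id, lemma_tokens): consecutive
    # lines sharing the same document id are merged into one POS-filtered
    # token list before being yielded.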
    def __iter__(self):
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)
                abs_list = []

                lastid = None
                for line in file:
                    if self.print_status:
                        lc.update("Lemma Doc Stream")

                    data = json.loads(line)

                    doc_id = data['id']
                    xml = data['annotation']

                    if lastid != doc_id and len(abs_list) > 0:
                        # Yield Stuff
                        yield lastid, abs_list
                        abs_list = []

                    lastid = doc_id
                    token_list = mf.xml2lemmas(xml)
                    pos_list = mf.xml2pos(xml)

                    for tokens, pos_tags in zip(token_list, pos_list):
                        token_cleaned, pos_cleaned = utils.posFilterString(tokens, pos_tags)
                        abs_list.extend(token_cleaned)
                if len(abs_list) > 0:
                    # Yield Stuff
                    yield lastid, abs_list
Example #2
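    # Streams tokenised sentences from the pickled .antn annotation files;
    # depending on self.output it yields either the sentence alone or
    # (abstract_id, sentence_id, sentence).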
    def __iter__(self):
        lc = LoopTimer(update_after=self.print_settings["update_after"],
                       avg_length=self.print_settings['avg_length'],
                       target=self.limit)

        for abstract_id in self.abstracts:
            path_to_annotation_file = os.path.join(self.path_to_annotations, abstract_id + ".antn")
            if not os.path.isfile(path_to_annotation_file):
                print()
                print(abstract_id + " in db but missing file.")
                print()
                continue

            with open(path_to_annotation_file, "rb") as annotation_file:
                annotation = pickle.load(annotation_file)

            document = nlp_to_sent_token(annotation,
                                         token_type=self.token_type,
                                         clean=self.token_cleaned,
                                         lower=self.lower,
                                         bigrams=self.bigram,
                                         dictionary=self.dictionary)

            for sentence_id, sentence in enumerate(document):
                if self.print_status:
                    lc.update("Yield Sentence")
                if self.output is None:
                    yield sentence
                elif self.output == 'all':
                    yield abstract_id, sentence_id, sentence
Example #3
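    # Streams the first annotation file as (doc_id, para_id, lemma_tokens),
    # with each paragraph's tokens POS-filtered.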
    def __iter__(self):
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)

                for line in file:
                    if self.print_status:
                        lc.update("Lemma Para Stream")

                    data = json.loads(line)

                    doc_id = data['id']
                    para_id = data['paragraphID']
                    xml = data['annotation']

                    token_list = mf.xml2lemmas(xml)
                    pos_list = mf.xml2pos(xml)

                    para_list = []
                    for tokens, pos_tags in zip(token_list, pos_list):
                        token_cleaned, pos_cleaned = utils.posFilterString(tokens, pos_tags)
                        para_list.extend(token_cleaned)
                    yield doc_id, para_id, para_list
Example #4
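    # Yields (doc_id, para_num, pos_bigrams) for every non-empty sentence;
    # para_num counts consecutive lines that share the same document id.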
    def __iter__(self):
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)

                lastid = None
                for line in file:
                    if self.print_status:
                        lc.update("Posbigram Sent Stream")

                    data = json.loads(line)

                    xml = data['annotation']
                    doc_id = data['id']  # renamed from `id` to avoid shadowing the builtin
                    if lastid != doc_id:
                        para_num = 0
                    else:
                        para_num += 1
                    lastid = doc_id

                    token_list = mf.xml2words(xml)
                    pos_list = mf.xml2pos(xml)

                    for tokens, pos_tags in zip(token_list, pos_list):
                        token_cleaned, pos_cleaned = utils.posFilterString(tokens, pos_tags)

                        if len(token_cleaned) > 0:
                            yield doc_id, para_num, utils.makeBigrams(pos_cleaned)
Example #5
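# Collects every phrase that any of the given rules extracts from the cached
# dependency trees in dep_tree_dict.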
def findphrasesbyrules(rf_rules):
    phrases_ = []
    lc = LoopTimer(update_after=5000,
                   avg_length=10000,
                   target=len(dep_tree_dict))
    for abstract_id, sentence_id in dep_tree_dict.keys():
        dep_tree = dep_tree_dict[(abstract_id, sentence_id)]

        for rf_rule in rf_rules:
            phrases_.extend(pm.get_phrases(dep_tree, rf_rule))
        lc.update("Find Phrases By Rules")
    return phrases_
Example #6
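# Labels SSORC paragraphs by matching their lower-cased titles against the
# candidate strings defined in rf_labels.txt and appends each match to
# targets.json.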
def rf_label_ssorc_paragraphs():
    dirname = os.path.dirname(__file__)
    json_dir = os.path.join(dirname, '../../data/processed/ssorc/json')
    file_list = sorted([
        f for f in os.listdir(json_dir)
        if os.path.isfile(os.path.join(json_dir, f)) and f.endswith('.para')
    ])

    target_file_path = os.path.join(
        dirname, '../../data/processed/ssorc/rf_targets/targets.json')
    if os.path.isfile(target_file_path):
        os.remove(target_file_path)

    labels = dict()
    with open(os.path.join(dirname,
                           '../../data/definitions/rf_labels.txt')) as rfdef:
        for line in rfdef:
            linesplit = line.split('\t')
            label = linesplit[0]
            candidates = linesplit[1].rstrip().split(',')
            labels[label] = []
            for candidate in candidates:
                if len(candidate) > 0:
                    candidate = candidate.lower()
                    labels[label].append(candidate)
                    labels[label].append(candidate + ":")

    with open(target_file_path, 'a') as target_file:
        for filename in file_list:
            cur_path = os.path.join(json_dir, filename)

            lt = LoopTimer()
            with open(cur_path) as file:
                for idx, file_line in enumerate(file):
                    data = json.loads(file_line)
                    title = data['paragraphTitle'].lower()

                    rf_label = -1
                    for key in labels:
                        if title in labels[key]:
                            rf_label = key
                            break

                    if rf_label != -1:
                        target_data = {}
                        target_data['id'] = data['id']
                        target_data['paragraphID'] = data['paragraphID']
                        target_data['rflabel'] = rf_label

                        json_string = json.JSONEncoder().encode(target_data)
                        target_file.write(json_string + '\n')
                    lt.update("RF Labeling")
Example #7
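    # Streams raw annotation XML per paragraph as (doc_id, para_num, xml),
    # skipping entries that do not start with an XML declaration.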
    def __iter__(self):
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)

                for line in file:
                    if self.print_status:
                        lc.update("XML Para Stream")

                    data = json.loads(line)

                    xml = data['annotation']
                    doc_id = data['id']
                    para_num = data['paragraphID']

                    if xml.startswith('<?xml'):
                        yield doc_id, para_num, xml
Example #8
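# Builds a sparse TF-IDF document-term matrix from the lemma corpus
# (optionally capped at num_samples documents) and saves it as an .npz file.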
def build_scipy_feature_file(dtype, prefix='', num_samples=10000):
    dirname = os.path.dirname(__file__)
    dictionary_dir = os.path.join(dirname, '../../data/processed/' + dtype + '/dictionaries')
    tfidf_dir = os.path.join(dirname, '../../data/processed/' + dtype + '/tfidf')
    feature_file = os.path.join(dirname, '../../data/processed/' + dtype + '/features/tm_features_' + prefix + '.npz')

    if os.path.isfile(feature_file):
        os.remove(feature_file)

    lemma_dic = gensim.corpora.Dictionary.load(os.path.join(dictionary_dir, 'lemma.dic'))

    lemma_tfidf = gensim.models.TfidfModel.load(os.path.join(tfidf_dir, 'lemma_model.tfidf'))

    lemma_corpus = corpora.lemma_doc_stream(dtype)

    lt = LoopTimer()

    row = []
    col = []
    data = []
    # Track the number of documents actually processed so the matrix shape is
    # correct when the num_samples limit triggers the break.
    m = 0
    for idx, lemmas in enumerate(lemma_corpus):
        if num_samples != -1 and idx == num_samples:
            break

        lemma_bow = lemma_dic.doc2bow(lemmas[1])
        vec_lemma_tfidf = lemma_tfidf[lemma_bow]

        for entry in vec_lemma_tfidf:
            row.append(idx)
            col.append(entry[0])
            data.append(entry[1])
        m = idx + 1
        lt.update("Build Features")

    n = len(lemma_dic)

    row = np.array(row)
    col = np.array(col)
    data = np.array(data)

    feature_vector = scipy.sparse.csc_matrix((data, (row, col)), shape=(m, n))

    scipy.sparse.save_npz(feature_file, feature_vector)
Example #9
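    # Streams the pickled annotations; depending on self.output it yields
    # either the annotation alone or (abstract_id, annotation).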
    def __iter__(self):
        lc = LoopTimer(update_after=self.print_settings["update_after"],
                       avg_length=self.print_settings['avg_length'],
                       target=self.limit)

        for abstract_id in self.abstracts:
            path_to_annotation_file = os.path.join(self.path_to_annotations, abstract_id + ".antn")
            if not os.path.isfile(path_to_annotation_file):
                print()
                print(abstract_id + " in db but missing file.")
                print()
                continue

            with open(path_to_annotation_file, "rb") as annotation_file:
                annotation = pickle.load(annotation_file)
            if self.print_status:
                lc.update("Yield Abstract")
            if self.output is None:
                yield annotation
            elif self.output == 'all':
                yield abstract_id, annotation
Example #10
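# Builds gensim dictionaries for words, lemmas, POS tags and their bigrams
# from the corresponding document streams and saves them to the interim
# directory.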
def make_dictionaries(dtype):
    dirname = os.path.dirname(__file__)
    inter_dir = os.path.join(dirname, '../../data/interim', dtype)

    word_dic = gensim.corpora.Dictionary()
    pos_dic = gensim.corpora.Dictionary()
    lemma_dic = gensim.corpora.Dictionary()
    wordbi_dic = gensim.corpora.Dictionary()
    posbi_dic = gensim.corpora.Dictionary()

    word_corpus = corpora.word_doc_stream(dtype)
    wordbigram_corpus = corpora.wordbigram_doc_stream(dtype)
    pos_corpus = corpora.pos_doc_stream(dtype)
    posbigram_corpus = corpora.posbigram_doc_stream(dtype)
    lemma_corpus = corpora.lemma_doc_stream(dtype)

    lt = LoopTimer()
    for word_doc, wordbigram_doc, pos_doc, posbigram_doc, lemma_doc in zip(
            word_corpus, wordbigram_corpus, pos_corpus, posbigram_corpus,
            lemma_corpus):

        lemma_dic.add_documents([lemma_doc[1]], prune_at=20000000)
        word_dic.add_documents([word_doc[1]], prune_at=20000000)
        wordbi_dic.add_documents([wordbigram_doc[1]], prune_at=20000000)
        pos_dic.add_documents([pos_doc[1]], prune_at=20000000)
        posbi_dic.add_documents([posbigram_doc[1]], prune_at=20000000)

        lt.update("Build Dictionaries")

    lemma_dic.save(os.path.join(inter_dir, 'full_lemma.dict'))
    wordbi_dic.save(os.path.join(inter_dir, 'full_wordbi.dict'))
    word_dic.save(os.path.join(inter_dir, 'full_word.dict'))
    posbi_dic.save(os.path.join(inter_dir, 'full_posbi.dict'))
    pos_dic.save(os.path.join(inter_dir, 'full_pos.dict'))

    print(word_dic)
    print(wordbi_dic)
    print(pos_dic)
    print(posbi_dic)
    print(lemma_dic)
Example #11
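# Converts raw SSORC records into a reduced JSON-lines file, keeping only
# English abstracts that have a year, a DOI and more than 50 words.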
def make_ssorc_data():
    dirname = os.path.dirname(__file__)
    raw_dir = os.path.join(dirname, '../../data/raw/ssorc')
    file_list = sorted([
        f for f in os.listdir(raw_dir)
        if os.path.isfile(os.path.join(raw_dir, f))
    ])
    json_dir = os.path.join(dirname, '../../data/processed/ssorc/json')

    for filename in file_list[1:2]:
        cur_path = os.path.join(raw_dir, filename)

        json_file = os.path.join(json_dir, filename + '.json')
        if os.path.isfile(json_file):
            os.remove(json_file)

        print(filename)

        lt = LoopTimer()
        with open(json_file, 'a') as wfile:
            with open(cur_path) as file:
                for idx, file_line in enumerate(file):
                    data = json.loads(file_line)

                    # Keep only English abstracts that have a year, a DOI and
                    # more than 50 words of abstract text.
                    if ('year' in data and 'paperAbstract' in data and 'doi' in data
                            and data['year'] != ''
                            and len(data['paperAbstract'].split()) > 50
                            and data['doi'] != ''
                            and check_string_for_english(data['paperAbstract'])):
                        new_data = {
                            'year': data['year'],
                            'paperAbstract': data['paperAbstract'],
                            'id': data['doi'],
                        }
                        wfile.write(json.dumps(new_data) + '\n')
                    lt.update("Make Data")
Example #12
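# Merges the DBLP information and abstract TSV files into a single JSON-lines
# file, keeping only abstracts detected as English.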
def make_dblp_data():
    dirname = os.path.dirname(__file__)
    raw_dir = os.path.join(dirname, '../../data/raw/dblp')
    data_abstract_file = os.path.join(raw_dir, 'data_abstracts.tsv')
    data_information_file = os.path.join(raw_dir, 'data_information.tsv')

    json_dir = os.path.join(dirname, '../../data/processed/dblp/json')
    json_file = os.path.join(json_dir, 'dblp.json')
    if os.path.isfile(json_file):
        os.remove(json_file)

    with open(data_information_file) as infofile:
        with open(data_abstract_file) as abstractfile:
            with open(json_file, 'a') as jfile:
                lt = LoopTimer()
                count = 0
                for infoline, abstractline in zip(infofile, abstractfile):
                    infodata = infoline.split('\t')
                    abstractdata = abstractline.split('\t')

                    infoID = infodata[0]
                    infoDOI = infodata[1]
                    infoTitle = infodata[2]
                    infoAuthors = infodata[3]
                    infoYear = int(infodata[4])

                    abstractID = abstractdata[0]
                    abstractContent = abstractdata[1]

                    if check_string_for_english(abstractContent):
                        new_data = {}
                        new_data['year'] = infoYear
                        new_data['paperAbstract'] = abstractContent
                        new_data['id'] = infoDOI
                        jsonstring = json.JSONEncoder().encode(new_data)
                        jfile.write(jsonstring + '\n')
                        count += 1
                    lt.update(str(count) + " Abstracts added")
Example #13
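# Builds a dependency tree for every sentence of the selected ML abstracts
# (fetched from MySQL) and stores it in dep_tree_dict keyed by
# (abstract_id, sentence_id).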
def make_deptree(mod_name, dep_tree_dict, dictionary, dep_type='basicDependencies', limit=2000):
    #dep_type = 'enhancedDependencies'
    #dep_type = 'enhancedPlusPlusDependencies'

    connection = mysql.connector.connect(
        host="localhost",
        user="******",
        passwd="thesis",
    )

    cursor = connection.cursor()
    cursor.execute("USE ssorc;")
    sq1 = f"SELECT abstract_id FROM abstracts_ml WHERE entities LIKE '%machine learning%' AND annotated=1 LIMIT {limit}"
    cursor.execute(sq1)

    print("Collecting Abstracts")
    abstracts = set()
    for row in cursor:
        abstracts.add(row[0])
    connection.close()
    print(f"{len(abstracts)} to build.")

    size = len(abstracts)

    annotations = AnnotationStream(abstracts=abstracts, output='all')

    lc = LoopTimer(update_after=10, avg_length=200, target=size)
    for abstract_id, annotation in annotations:
        for sentence in annotation['sentences']:
            dep_tree = sentence2tree(sentence, dictionary=dictionary, dep_type_=dep_type)
            sentence_id = int(sentence['index'])
            if dep_tree is not None:
                dep_tree_dict[(abstract_id, sentence_id)] = dep_tree
        lc.update("Build Dep Tree Dict")
    print()
    print(f"Size of Dictionary: {len(dictionary)}")
Example #14
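# Excerpt: builds a sparse TF-IDF feature matrix plus a label vector from the
# corpus and saves the features as an .npz file.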
for idx, document in enumerate(corpus):
    words = document[1]
    abstract_id = document[0]

    label = abstract_labels[abstract_id]
    labels.append(label)

    bow = dictionary.doc2bow(words)
    vec_tfidf = tfidf[bow]

    for entry in vec_tfidf:
        row.append(idx)
        col.append(entry[0])
        data.append(entry[1])
    lt.update("Build Features")

m = idx + 1
n = len(dictionary)

row = np.array(row)
col = np.array(col)
data = np.array(data)

labels = np.array(labels)
feature_vector = scipy.sparse.csc_matrix((data, (row, col)), shape=(m, n))

print(feature_vector.shape)
print(labels.shape)

scipy.sparse.save_npz(path_to_feature_file, feature_vector)
Example #15
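# Excerpt: streams the selected range of raw files, parses each JSON line and
# counts the records skipped because required keys are missing.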
filerange = [0, 39]
filerange[1] = min(filerange[1], 39)
blastfile = 1 if filerange[1] == 39 else 0
target = min(39, (filerange[1] - filerange[0])) * 1000000 + blastfile * 219709

key_error = 0
mass_error = 0
prune_error = 0

lt = LoopTimer(update_after=500, avg_length=1000000, target=target)
for filename in file_list[filerange[0]:filerange[1]]:
    cur_path = os.path.join(raw_dir, filename)
    with open(cur_path) as file:
        for idx, file_line in enumerate(file):
            update_string = f"Prep  - Count:{count} |  key: {key_error} - different: {mass_error} - One Char: {prune_error}"
            break_p = lt.update(update_string)
            data = json.loads(file_line)
            if not all(key in data for key in req_keys):
                key_error += 1
                continue
            title = data['title']
            abstract = data['paperAbstract']
            abstract_id = data['id']

            year = data['year']
            authors = data['authors']
            inCitations = data['inCitations']
            outCitations = data['outCitations']

            entities = data['entities']
            journal = data['journalName'].lower()
Example #16
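    # Excerpt: collects the entity sets of abstracts mentioning
    # 'machine learning' or 'artificial intelligence' and pickles the
    # resulting word sets.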
    print(filename)
    lt = LoopTimer(update_after=50000,
                   avg_length=50000,
                   target=len(dict_frame))
    for abstract_id in dict_frame:
        entities = dict_frame[abstract_id]

        if 'machine learning' in entities:
            ml_words.update(set(entities))
            ml_abstracts[abstract_id] = entities

        if 'artificial intelligence' in entities:
            ai_words.update(set(entities))
            ai_abstracts[abstract_id] = entities

        lt.update("Parse Dict")

    print()
    count += 1
    filename = f"all_ent_{count}.dict"

print(f"Num ml words {len(ml_words)}")
print(f"Num ai words {len(ai_words)}")
print(f"Num ml abstracts {len(ml_abstracts)}")
print(f"Num ai abstracts {len(ai_abstracts)}")

with open(os.path.join(panda_path, "ml_entities.pickle"), "wb") as ml_file:
    pickle.dump(ml_words, ml_file)
with open(os.path.join(panda_path, "ai_entities.pickle"), "wb") as ai_file:
    pickle.dump(ai_words, ai_file)
Example #17
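# Scores every pickled feature vector with the classifier `mllr` (loaded
# elsewhere) and writes the resulting isML label back to the abstracts table.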
cursor = connection.cursor()
cursor.execute("USE ssorc;")

with open(path_to_feature_file, "rb") as feature_file:
    documents = pickle.load(feature_file)

pos = 0
neg = 0
lc = LoopTimer(update_after=50, avg_length=700, target=len(documents))
for abstract_id in documents:
    features = documents[abstract_id]
    prediction = mllr.predict_proba(features)

    if prediction[0][1] > 0.8:
        label = 1
        pos += 1
    else:
        label = -1
        neg += 1

    sql = f'UPDATE abstracts SET isML = {label} WHERE abstract_id = "{abstract_id}"'
    cursor.execute(sql)
    lc.update("Classify")

print()
print(f"POS: {pos}")
print(f"NEG: {neg}")

connection.commit()
connection.close()
Example #18
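    # Excerpt: gathers the mention vectors of all unique mentions into a
    # feature list and pickles it as knn_features.pickle.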
    os.mkdir(path_to_mlgenome_features)

with open(os.path.join(path_to_mlgenome, "unique_mentions.pickle"),
          "rb") as handle:
    mentions = pickle.load(handle)

feature_vector = []

lt_target = len(mentions)
lt = LoopTimer(update_after=5000, avg_length=10000, target=lt_target)
for mention in mentions:
    m_string = mention["string"]
    m_is_acronym = mention['is_acronym']
    m_vec = mention['m_vec']

    #if m_is_acronym:
    #    continue

    feature_vector.append(m_vec)

    breaker = lt.update(f"Make Training-Set - {len(feature_vector)}")

print(len(feature_vector))

feature_dict = dict()

feature_dict["features"] = feature_vector

with open(os.path.join(path_to_mlgenome_features, "knn_features.pickle"),
          "wb") as handle:
    pickle.dump(feature_dict, handle)
Example #19
                    journal_dict[journal] += 1

                if venue in venues:
                    if venue not in venue_dict:
                        venue_dict[venue] = 0
                    venue_dict[venue] += 1

                if venue in venues or journal in journals:
                    total_count += 1
                    if "machine learning" in entities:
                        ml_count += 1

                    if "artificial intelligence" in entities:
                        ai_count += 1

            breaker = lt.update(f"Analyze ({total_count})")


sorted_venue = sorted(venue_dict.items(), key=operator.itemgetter(1), reverse=True)
sorted_journals = sorted(journal_dict.items(), key=operator.itemgetter(1), reverse=True)

print("\n\n")
print("Venues")
for item in sorted_venue[:20]:
    print(f"{item[0]}: {item[1]}")
print()
print("------------------------")
print("Journals")
for item in sorted_journals[:20]:
    print(f"{item[0]}: {item[1]}")
Example #20
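# Excerpt: loads the saved spaCy Doc of every abstract in infoDF and builds
# tab-separated word and lemma strings per sentence for a pandas frame.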
lemma_list = list()
coarse_pos_list = list()
fine_pos_list = list()
ent_type_list = list()

merged_word_list = list()
merged_ent_type_list = list()

targ = len(infoDF)
not_found = 0
lt = LoopTimer(update_after=100, avg_length=10000, target=targ)
for abstract_id, row in infoDF.iterrows():
    file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy")
    if not os.path.isfile(file_path):
        not_found += 1
        lt.update(f"Create Pandas - {len(abstract_id_list)}")
        continue
    doc = Doc(vocab).from_disk(file_path)

    abstract_id_list.append(abstract_id)

    word_list.append("\t\t".join([
        "\t".join(
            [token.text for token in sentence if token_conditions(token)])
        for sentence in doc.sents
    ]))
    lemma_list.append("\t\t".join([
        "\t".join(
            [token.lemma_ for token in sentence if token_conditions(token)])
        for sentence in doc.sents
    ]))
Example #21
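# Collects all abstracts that are not yet annotated (year > 1990) from MySQL
# and sets up a Stanford CoreNLP pipeline to annotate them.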
connection = mysql.connector.connect(
            host="localhost",
            user="******",
            passwd="thesis",
        )

cursor = connection.cursor()
cursor.execute("USE ssorc;")
cursor.execute("SELECT abstract_id FROM abstracts WHERE annotated=0 and year>1990")

lc = LoopTimer(update_after=1000)
abstracts_to_process = set()
for idx, row in enumerate(cursor):
    abstracts_to_process.add(row[0])
    lc.update("Collect Abstracts to Process")
connection.close()
print()

print("There are " + str(len(abstracts_to_process)) + " files to process")

annotators = 'tokenize,ssplit,pos,lemma,depparse'
splitter_annotators = 'ssplit'
nlp = StanfordCoreNLP('../../../stanford-corenlp-full-2018-02-27')
props = {'annotators': annotators, 'pipelineLanguage': 'en', 'outputFormat': 'json'}
split_props = {'annotators': splitter_annotators, 'pipelineLanguage': 'en', 'outputFormat': 'json'}

lc = LoopTimer(update_after=1, avg_length=500)
for idx, abstract_id in enumerate(abstracts_to_process):
    lc.update("Annotate Abstract " + abstract_id)
Example #22
            sent_info['sent_id'] = sent_id
            sent_infos.append(sent_info)

            print_string = ""

            label_count[label_dic[label_key]] += 1
            breaker = 0
            for lco in label_count:
                counting = label_count[lco]
                if counting >= label_limit:
                    breaker += 1
                print_string += f"{lco}: {counting} | "
            breaker = breaker / len(label_count)
            #print_string = print_string[:len(print_string)-3]
            print_string += f"Breaker: {breaker}"
            lc.update(f"Build AP Features | {print_string}")
            if breaker >= 1:
                break

    if breaker >= 1:
        break

for feature_data in sent_infos:
    did = feature_data['id']
    sid = feature_data['sent_id']
    label_key = (did, sid)

    target = label_dic[label_key]

    target_vector.append(target)
Example #23
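    # Excerpt: 10-fold cross-validation over candidate C values for a linear
    # SVC; the best-scoring C is then used to fit the final model.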
    #                gamma='auto')
    model = svm.SVC(kernel="linear", C=c_para, decision_function_shape='ovo')
    scores = cross_val_score(model,
                             learning_features,
                             learning_targets,
                             cv=10,
                             n_jobs=-1)
    mean_score = scores.mean()

    if mean_score > best_score:
        best_score = mean_score
        best_para = c_para

    score_list.append((c_para, mean_score))

    lc.update(f"Best Para: {best_para} - Best Score: {best_score}")

print()
print(score_list)
print()
print("Best Reg-Para: " + str(best_para))
print()

#best_model = svm.SVC(decision_function_shape='ovo',
#                     C=best_para,
#                     kernel='rbf',
#                     gamma='auto')
best_model = svm.SVC(kernel="linear",
                     C=best_para,
                     decision_function_shape='ovo')
best_model.fit(learning_features, learning_targets)
Example #24
            if all(key in data for key in req_keys):
                entities = [entity.lower() for entity in data['entities']]

                journal = data['journalName'].lower()
                venue = data['venue'].lower()

                if journal not in journal_dict:
                    journal_dict[journal] = 0
                journal_dict[journal] += 1

                if venue not in venue_dict:
                    venue_dict[venue] = 0
                venue_dict[venue] += 1

            breaker = lt.update(f"Make Data")


sorted_venue = sorted(venue_dict.items(), key=operator.itemgetter(1), reverse=True)
sorted_journals = sorted(journal_dict.items(), key=operator.itemgetter(1), reverse=True)

print("\n\n")
print("Venues")
for item in sorted_venue[:20]:
    print(f"{item[0]}: {item[1]}")
print()
print("------------------------")
print("Journals")
for item in sorted_journals[:20]:
    print(f"{item[0]}: {item[1]}")
Example #25
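# Inserts the abstract ids of the detected ML-algorithm abstracts into the
# mlabstracts MySQL table.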
import os
import pickle
import mysql.connector

from src.utils.LoopTimer import LoopTimer

path_to_ner = "/media/norpheo/mySQL/db/ssorc/NER"
path_to_ml_algo_abstract_save = os.path.join(path_to_ner, "ml_algo_abstract_new.pickle")


with open(path_to_ml_algo_abstract_save, "rb") as algo_abstract_file:
    ml_algo_abstract = pickle.load(algo_abstract_file)

connection = mysql.connector.connect(
            host="localhost",
            user="******",
            passwd="thesis",
        )

cursor = connection.cursor()
cursor.execute("USE ssorc;")


lc = LoopTimer(update_after=100, avg_length=10000)
for idx, abstract_id in enumerate(ml_algo_abstract):
    sq1 = f'INSERT INTO mlabstracts (abstract_id) VALUES("{abstract_id}") ON DUPLICATE KEY UPDATE abstract_id = ("{abstract_id}")'
    cursor.execute(sq1)

    lc.update("Insert Into")

connection.commit()
connection.close()
Example #26
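    # Excerpt: parses a tab-separated label file into a
    # (abstract_id, sent_id) -> label mapping and pickles the targets.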
    for line in label_file:
        info = line.replace('\n', '').split('\t')

        if len(info) != 4:
            print(len(info))
            print(line)
            continue

        abstract_id = info[0]
        sent_id = info[1]
        label = info[3]

        if label == '0':
            continue

        if label not in label_set:
            label_set[label] = 0

        label_set[label] += 1

        label_key = (abstract_id, int(sent_id))

        targets[label_key] = label
        lc.update("Make Targets")
print()
print(f"labels: {label_set}")
print(f"Size: {len(targets)}")

with open(target_path, 'wb') as target_file:
    pickle.dump(targets, target_file, protocol=pickle.HIGHEST_PROTOCOL)
Example #27
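    # Excerpt: compares the current mention with every candidate mention and
    # predicts entity links with the mlares_clf classifier (loaded elsewhere).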
    for candidate in mentions:
        c_string = candidate["mention_string"]
        c_tokens = candidate["mention_tokens"]
        c_dvec = candidate["doc_vector"]
        c_svec = candidate["sentence_vector"]
        c_mvec = candidate["mention_vector"]

        similarity = mlares.similarity_ratio(m_string, c_string)
        intersection = mlares.intersection_of_words(m_tokens, c_tokens)
        is_pfix = mlares.is_prefix(m_tokens, c_tokens)
        is_ifix = mlares.is_infix(m_tokens, c_tokens)
        is_sfix = mlares.is_suffix(m_tokens, c_tokens)

        dvec_sim = mlares.vec_sim(m_dvec, c_dvec)
        svec_sim = mlares.vec_sim(m_svec, c_svec)
        mvec_sim = mlares.vec_sim(m_mvec, c_mvec)

        feat_vec = [
            similarity, intersection, is_pfix, is_ifix, is_sfix, dvec_sim,
            svec_sim, mvec_sim
        ]

        entities = (m_string, c_string)

        entity_linking[entities] = mlares_clf.predict([feat_vec])

        lt.update(f"Make Training-Set - {len(entity_linking)}")

with open(os.path.join(path_to_mlgenome, "entity_linking.pickle"),
          "wb") as el_file:
    pickle.dump(entity_linking, el_file)
Example #28
                    ent_counter[entity_name] = {
                        "list": en_idx_list,
                        "counter": en_count
                    }

                    entity = (start_idx, end_idx, "MLALGO")
                    ent_list.append(entity)

                #print(f"Add: {entity_name}")
                #pos_string = " ".join([sentence[tid].tag_ for tid in range(start_id, end_id+1)])
                #print(f'     {pos_string}')
        if len(ent_list) > 0:
            entities = {"entities": ent_list}
            TRAIN_DATA.append((sentence.text, entities))

    breaker = lt.update(f"Make TD - {len(TRAIN_DATA)}")

    #if breaker > 100000:
    #    break

print()
for entity_name in ent_counter:
    en_count = ent_counter[entity_name]["counter"]
    en_list = ent_counter[entity_name]["list"]
    if en_count <= threshold:
        for entry in en_list:
            train_id = entry["sid"]
            ent_id = entry["entid"]

            TRAIN_DATA[train_id][1]["entities"][ent_id] = None
    else:
Example #29
    topic_dist = tm_model.get_topic_dist(tokens)

    top_n_topics = topic_dist.argsort()[::-1][:5]

    if year not in year_count:
        year_count[year] = 0
        for topic in track_topics:
            timeseries[topic][year] = 0
    year_count[year] += 1

    for topic in track_topics:
        for topic_n in track_topics[topic]:
            if topic_n in top_n_topics:
                timeseries[topic][year] += topic_dist[topic_n]

    breaker = lc.update("Model Topics")
print()

for topic in track_topics:
    years_list = [key for key in year_count.keys() if key < 2018]
    years_list.sort()
    x = np.array(years_list)
    y = np.array([timeseries[topic][year]/year_count[year] for year in years_list])

    fig, ax = plt.subplots()
    ax.plot(x, y)

    fig.suptitle(f"{topic} - LDA", fontsize=10, y=1.00)
    start, end = ax.get_xlim()
    start = int(start)
    end = int(end)
Example #30
                n_cat = cat_assignment.count(category)

                if n_cat > top_cat_count:
                    top_cat_count = n_cat
                    top_cat = category
            location = s_id / n_sents
            word_vector = sentence.vector

            features = np.append(word_vector, [location])

            target_vector.append(top_cat)
            feature_vector.append(features)

    info_count = [(cat, target_vector.count(cat))
                  for cat in set(target_vector)]
    breaker = lc.update(f"Make Features - {info_count}")

feature_vector = np.array(feature_vector)
target_vector = np.array(target_vector)

print(feature_vector.shape)
print(target_vector.shape)

feature_dict = dict()

feature_dict["features"] = feature_vector
feature_dict["targets"] = target_vector
feature_dict["settings"] = settings

with open(os.path.join(path_to_rfl, f"{feature_info_name}.pickle"),
          "wb") as handle: