Example #1
    def __iter__(self):
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)
                abs_list = []

                lastid = None
                for line in file:
                    if self.print_status:
                        lc.update("Lemma Doc Stream")

                    data = json.loads(line)

                    doc_id = data['id']
                    xml = data['annotation']

                    if lastid != doc_id and len(abs_list) > 0:
                        # emit the tokens collected for the previous document
                        yield lastid, abs_list
                        abs_list = []

                    lastid = doc_id
                    token_list = mf.xml2lemmas(xml)
                    pos_list = mf.xml2pos(xml)

                    for i in range(0, len(token_list)):
                        token_cleaned, pos_cleaned = utils.posFilterString(token_list[i], pos_list[i])

                        if len(token_cleaned) > 0:
                            for j in range(0, len(token_cleaned)):
                                abs_list.append(token_cleaned[j])
                if len(abs_list) > 0:
                    # emit the final document
                    yield lastid, abs_list
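
The examples in this listing all drive a LoopTimer progress helper imported from src.utils.LoopTimer. Its implementation is not shown here; the following is only a minimal sketch of the interface the calls assume (an update_after/avg_length/target constructor and an update(message) method that returns the iteration count), written as a stand-in for illustration rather than the project's actual class.

import time


class LoopTimer:
    """Minimal stand-in for the assumed progress-reporting interface."""

    def __init__(self, update_after=1, avg_length=100, target=None):
        self.update_after = update_after   # report every N calls to update()
        self.avg_length = avg_length       # window for rate smoothing (ignored in this sketch)
        self.target = target               # optional total number of iterations
        self.count = 0
        self.start = time.time()

    def update(self, message=""):
        self.count += 1
        if self.count % self.update_after == 0:
            rate = self.count / max(time.time() - self.start, 1e-9)
            suffix = f"/{self.target}" if self.target is not None else ""
            print(f"\r{message}: {self.count}{suffix} ({rate:.1f} it/s)", end="", flush=True)
        return self.count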
Example #2
    def __iter__(self):
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)

                for line in file:
                    if self.print_status:
                        lc.update("Lemma Para Stream")

                    data = json.loads(line)

                    doc_id = data['id']
                    para_id = data['paragraphID']
                    xml = data['annotation']

                    token_list = mf.xml2lemmas(xml)
                    pos_list = mf.xml2pos(xml)

                    para_list = []
                    for i in range(0, len(token_list)):
                        token_cleaned, pos_cleaned = utils.posFilterString(token_list[i], pos_list[i])

                        if len(token_cleaned) > 0:
                            for j in range(0, len(token_cleaned)):
                                para_list.append(token_cleaned[j])
                    yield doc_id, para_id, para_list
Example #3
    def __iter__(self):
        lc = LoopTimer(update_after=self.print_settings["update_after"],
                       avg_length=self.print_settings['avg_length'],
                       target=self.limit)

        for abstract_id in self.abstracts:
            path_to_annotation_file = os.path.join(self.path_to_annotations, abstract_id + ".antn")
            if not os.path.isfile(path_to_annotation_file):
                print()
                print(abstract_id + " in db but missing file.")
                print()
                continue

            with open(path_to_annotation_file, "rb") as annotation_file:
                annotation = pickle.load(annotation_file)

            document = nlp_to_sent_token(annotation,
                                         token_type=self.token_type,
                                         clean=self.token_cleaned,
                                         lower=self.lower,
                                         bigrams=self.bigram,
                                         dictionary=self.dictionary)

            for sentence_id, sentence in enumerate(document):
                if self.print_status:
                    lc.update("Yield Sentence")
                if self.output is None:
                    yield sentence
                elif self.output == 'all':
                    yield abstract_id, sentence_id, sentence
Example #4
    def __iter__(self):
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)

                lastid = None
                for line in file:
                    if self.print_status:
                        lc.update("Posbigram Sent Stream")

                    data = json.loads(line)

                    xml = data['annotation']
                    doc_id = data['id']
                    if lastid != doc_id:
                        para_num = 0
                    else:
                        para_num += 1
                    lastid = doc_id

                    token_list = mf.xml2words(xml)
                    pos_list = mf.xml2pos(xml)

                    for i in range(0, len(token_list)):
                        token_cleaned, pos_cleaned = utils.posFilterString(token_list[i], pos_list[i])

                        if len(token_cleaned) > 0:
                            yield doc_id, para_num, utils.makeBigrams(pos_cleaned)
Example #5
def findphrasesbyrules(rf_rules):
    phrases_ = []
    lc = LoopTimer(update_after=5000,
                   avg_length=10000,
                   target=len(dep_tree_dict))
    for abstract_id, sentence_id in dep_tree_dict.keys():
        dep_tree = dep_tree_dict[(abstract_id, sentence_id)]

        for rf_rule in rf_rules:
            phrases_.extend(pm.get_phrases(dep_tree, rf_rule))
        lc.update("Find Phrases By Rules")
    return phrases_
Example #6
def paragraph_splitter(dtype):
    dirname = os.path.dirname(__file__)
    json_dir = os.path.join(dirname, '../../data/processed/' + dtype + '/json')
    file_list = sorted([
        f for f in os.listdir(json_dir)
        if os.path.isfile(os.path.join(json_dir, f)) and f.endswith('.json')
    ])

    max_para_title_words = 2  # maximum number of words for a line to be considered a paragraph title
    min_para_words = 10  # minimum number of words for a line to be considered a paragraph

    for filename in file_list:
        cur_path = os.path.join(json_dir, filename)

        paragraph_file = os.path.join(json_dir, filename + '.para')
        if os.path.isfile(paragraph_file):
            os.remove(paragraph_file)

        print(filename)

        lt = LoopTimer()
        with open(paragraph_file, 'a') as wfile:
            with open(cur_path) as file:
                for file_line in file:
                    data = json.loads(file_line)
                    lines = data['paperAbstract'].split('\n')
                    paragraph_title = ''
                    paragraph_id = 0
                    new_data = {}
                    empty_line_allowed = False
                    for line in lines:
                        # Check whether the line matches a paragraph-title pattern
                        if len(line) > 0 and (
                                len(line.split()) <= max_para_title_words
                                or line == line.upper()):
                            paragraph_title = line
                            empty_line_allowed = True
                        elif len(line) > 0 and (len(line.split()) >=
                                                min_para_words):
                            new_data['paragraphContent'] = line
                            new_data['year'] = data['year']
                            new_data['id'] = data['id']
                            new_data['paragraphTitle'] = paragraph_title
                            new_data['paragraphID'] = paragraph_id

                            json_string = json.JSONEncoder().encode(new_data)
                            wfile.write(json_string + '\n')

                            empty_line_allowed = False
                            paragraph_id += 1
                        elif len(line) == 0 and not empty_line_allowed:
                            paragraph_title = ''
                    lt.update("Para-Split")
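
As a quick illustration of the splitting heuristic above, the hypothetical classify_line helper below applies the same two thresholds (max_para_title_words = 2, min_para_words = 10) to a few sample lines. It only mirrors the per-line decision for clarity and is not part of the original module.

MAX_PARA_TITLE_WORDS = 2   # same threshold as max_para_title_words above
MIN_PARA_WORDS = 10        # same threshold as min_para_words above


def classify_line(line):
    # A short or all-uppercase line is treated as a paragraph title.
    if len(line) > 0 and (len(line.split()) <= MAX_PARA_TITLE_WORDS or line == line.upper()):
        return "paragraph title"
    # A sufficiently long line becomes paragraph content.
    if len(line.split()) >= MIN_PARA_WORDS:
        return "paragraph content"
    return "ignored"


for sample in ["INTRODUCTION",
               "Related Work",
               "We evaluate the proposed model on three benchmark datasets and report accuracy.",
               ""]:
    print(repr(sample), "->", classify_line(sample))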
Example #7
def rf_label_ssorc_paragraphs():
    dirname = os.path.dirname(__file__)
    json_dir = os.path.join(dirname, '../../data/processed/ssorc/json')
    file_list = sorted([
        f for f in os.listdir(json_dir)
        if os.path.isfile(os.path.join(json_dir, f)) and f.endswith('.para')
    ])

    target_file_path = os.path.join(
        dirname, '../../data/processed/ssorc/rf_targets/targets.json')
    if os.path.isfile(target_file_path):
        os.remove(target_file_path)

    labels = dict()
    with open(os.path.join(dirname,
                           '../../data/definitions/rf_labels.txt')) as rfdef:
        for line in rfdef:
            linesplit = line.split('\t')
            label = linesplit[0]
            candidates = linesplit[1].rstrip().split(',')
            labels[label] = []
            for candidate in candidates:
                if len(candidate) > 0:
                    candidate = candidate.lower()
                    labels[label].append(candidate)
                    labels[label].append(candidate + ":")

    with open(target_file_path, 'a') as target_file:
        for filename in file_list:
            cur_path = os.path.join(json_dir, filename)

            lt = LoopTimer()
            with open(cur_path) as file:
                for idx, file_line in enumerate(file):
                    data = json.loads(file_line)
                    title = data['paragraphTitle'].lower()

                    rf_label = -1
                    for key in labels:
                        if title in labels[key]:
                            rf_label = key
                            break

                    if rf_label != -1:
                        target_data = {}
                        target_data['id'] = data['id']
                        target_data['paragraphID'] = data['paragraphID']
                        target_data['rflabel'] = rf_label

                        json_string = json.JSONEncoder().encode(target_data)
                        target_file.write(json_string + '\n')
                    lt.update("RF Labeling")
Example #8
    def __iter__(self):
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)

                for line in file:
                    if self.print_status:
                        lc.update("XML Para Stream")

                    data = json.loads(line)

                    xml = data['annotation']
                    doc_id = data['id']
                    para_num = data['paragraphID']

                    if xml.startswith('<?xml'):
                        yield doc_id, para_num, xml
Example #9
def build_scipy_feature_file(dtype, prefix='', num_samples=10000):
    dirname = os.path.dirname(__file__)
    dictionary_dir = os.path.join(dirname, '../../data/processed/' + dtype + '/dictionaries')
    tfidf_dir = os.path.join(dirname, '../../data/processed/' + dtype + '/tfidf')
    feature_file = os.path.join(dirname, '../../data/processed/' + dtype + '/features/tm_features_' + prefix + '.npz')

    if os.path.isfile(feature_file):
        os.remove(feature_file)

    lemma_dic = gensim.corpora.Dictionary.load(os.path.join(dictionary_dir, 'lemma.dic'))

    lemma_tfidf = gensim.models.TfidfModel.load(os.path.join(tfidf_dir, 'lemma_model.tfidf'))

    lemma_corpus = corpora.lemma_doc_stream(dtype)

    lt = LoopTimer()

    row = []
    col = []
    data = []
    for idx, lemmas in enumerate(lemma_corpus):
        if num_samples != -1 and idx == num_samples:
            break

        lemma_bow = lemma_dic.doc2bow(lemmas[1])
        vec_lemma_tfidf = lemma_tfidf[lemma_bow]

        for entry in vec_lemma_tfidf:
            row.append(idx)
            col.append(entry[0])
            data.append(entry[1])
        lt.update("Build Features")

    m = idx + 1
    n = len(lemma_dic)

    row = np.array(row)
    col = np.array(col)
    data = np.array(data)

    feature_vector = scipy.sparse.csc_matrix((data, (row, col)), shape=(m, n))

    scipy.sparse.save_npz(feature_file, feature_vector)
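
The row/col/data lists collected above are ordinary coordinate triples; the short sketch below builds the same kind of csc_matrix from a toy 2x3 example just to make the layout explicit. The numbers are made up for illustration.

import numpy as np
import scipy.sparse

row = np.array([0, 0, 1])          # document index of each non-zero entry
col = np.array([0, 2, 1])          # term id of each non-zero entry
data = np.array([0.5, 0.3, 0.8])   # tf-idf weight of each non-zero entry

matrix = scipy.sparse.csc_matrix((data, (row, col)), shape=(2, 3))
print(matrix.toarray())
# [[0.5 0.  0.3]
#  [0.  0.8 0. ]]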
Example #10
    def __iter__(self):
        lc = LoopTimer(update_after=self.print_settings["update_after"],
                       avg_length=self.print_settings['avg_length'],
                       target=self.limit)

        for abstract_id in self.abstracts:
            path_to_annotation_file = os.path.join(self.path_to_annotations, abstract_id + ".antn")
            if not os.path.isfile(path_to_annotation_file):
                print()
                print(abstract_id + " in db but missing file.")
                print()
                continue

            with open(path_to_annotation_file, "rb") as annotation_file:
                annotation = pickle.load(annotation_file)
            if self.print_status:
                lc.update("Yield Abstract")
            if self.output is None:
                yield annotation
            elif self.output == 'all':
                yield abstract_id, annotation
Example #11
def make_dictionaries(dtype):
    dirname = os.path.dirname(__file__)
    inter_dir = os.path.join(dirname, '../../data/interim', dtype)

    word_dic = gensim.corpora.Dictionary()
    pos_dic = gensim.corpora.Dictionary()
    lemma_dic = gensim.corpora.Dictionary()
    wordbi_dic = gensim.corpora.Dictionary()
    posbi_dic = gensim.corpora.Dictionary()

    word_corpus = corpora.word_doc_stream(dtype)
    wordbigram_corpus = corpora.wordbigram_doc_stream(dtype)
    pos_corpus = corpora.pos_doc_stream(dtype)
    posbigram_corpus = corpora.posbigram_doc_stream(dtype)
    lemma_corpus = corpora.lemma_doc_stream(dtype)

    lt = LoopTimer()
    for word_doc, wordbigram_doc, pos_doc, posbigram_doc, lemma_doc in zip(
            word_corpus, wordbigram_corpus, pos_corpus, posbigram_corpus,
            lemma_corpus):

        lemma_dic.add_documents([lemma_doc[1]], prune_at=20000000)
        word_dic.add_documents([word_doc[1]], prune_at=20000000)
        wordbi_dic.add_documents([wordbigram_doc[1]], prune_at=20000000)
        pos_dic.add_documents([pos_doc[1]], prune_at=20000000)
        posbi_dic.add_documents([posbigram_doc[1]], prune_at=20000000)

        lt.update("Build Dictionaries")

    lemma_dic.save(os.path.join(inter_dir, 'full_lemma.dict'))
    wordbi_dic.save(os.path.join(inter_dir, 'full_wordbi.dict'))
    word_dic.save(os.path.join(inter_dir, 'full_word.dict'))
    posbi_dic.save(os.path.join(inter_dir, 'full_posbi.dict'))
    pos_dic.save(os.path.join(inter_dir, 'full_pos.dict'))

    print(word_dic)
    print(wordbi_dic)
    print(pos_dic)
    print(posbi_dic)
    print(lemma_dic)
Example #12
def make_ssorc_data():
    dirname = os.path.dirname(__file__)
    raw_dir = os.path.join(dirname, '../../data/raw/ssorc')
    file_list = sorted([
        f for f in os.listdir(raw_dir)
        if os.path.isfile(os.path.join(raw_dir, f))
    ])
    json_dir = os.path.join(dirname, '../../data/processed/ssorc/json')

    for filename in file_list[1:2]:
        cur_path = os.path.join(raw_dir, filename)

        json_file = os.path.join(json_dir, filename + '.json')
        if os.path.isfile(json_file):
            os.remove(json_file)

        print(filename)

        lt = LoopTimer()
        with open(json_file, 'a') as wfile:
            with open(cur_path) as file:
                for idx, file_line in enumerate(file):
                    data = json.loads(file_line)

                    if ('year' in data) and ('paperAbstract' in data) and ('doi' in data):
                        if data['year'] != '' and data['doi'] != '' and len(data['paperAbstract'].split()) > 50:
                            if check_string_for_english(data['paperAbstract']):
                                new_data = {}
                                new_data['year'] = data['year']
                                new_data['paperAbstract'] = data['paperAbstract']
                                new_data['id'] = data['doi']
                                jsonstring = json.JSONEncoder().encode(new_data)
                                wfile.write(jsonstring + '\n')
                    lt.update("Make Data")
Example #13
def make_dblp_data():
    dirname = os.path.dirname(__file__)
    raw_dir = os.path.join(dirname, '../../data/raw/dblp')
    data_abstract_file = os.path.join(raw_dir, 'data_abstracts.tsv')
    data_information_file = os.path.join(raw_dir, 'data_information.tsv')

    json_dir = os.path.join(dirname, '../../data/processed/dblp/json')
    json_file = os.path.join(json_dir, 'dblp.json')
    if os.path.isfile(json_file):
        os.remove(json_file)

    with open(data_information_file) as infofile:
        with open(data_abstract_file) as abstractfile:
            with open(json_file, 'a') as jfile:
                lt = LoopTimer()
                count = 0
                for infoline, abstractline in zip(infofile, abstractfile):
                    infodata = infoline.split('\t')
                    abstractdata = abstractline.split('\t')

                    infoID = infodata[0]
                    infoDOI = infodata[1]
                    infoTitle = infodata[2]
                    infoAuthors = infodata[3]
                    infoYear = int(infodata[4])

                    abstractID = abstractdata[0]
                    abstractContent = abstractdata[1]

                    if check_string_for_english(abstractContent):
                        new_data = {}
                        new_data['year'] = infoYear
                        new_data['paperAbstract'] = abstractContent
                        new_data['id'] = infoDOI
                        jsonstring = json.JSONEncoder().encode(new_data)
                        jfile.write(jsonstring + '\n')
                        count += 1
                    lt.update(str(count) + " Abstracts added")
Example #14
def make_deptree(mod_name, dep_tree_dict, dictionary, dep_type='basicDependencies', limit=2000):
    #dep_type = 'enhancedDependencies'
    #dep_type = 'enhancedPlusPlusDependencies'

    connection = mysql.connector.connect(
        host="localhost",
        user="******",
        passwd="thesis",
    )

    cursor = connection.cursor()
    cursor.execute("USE ssorc;")
    sq1 = f"SELECT abstract_id FROM abstracts_ml WHERE entities LIKE '%machine learning%' AND annotated=1 LIMIT {limit}"
    cursor.execute(sq1)

    print("Collecting Abstracts")
    abstracts = set()
    for row in cursor:
        abstracts.add(row[0])
    connection.close()
    print(f"{len(abstracts)} to build.")

    size = len(abstracts)

    annotations = AnnotationStream(abstracts=abstracts, output='all')

    lc = LoopTimer(update_after=10, avg_length=200, target=size)
    for abstract_id, annotation in annotations:
        for sentence in annotation['sentences']:
            dep_tree = sentence2tree(sentence, dictionary=dictionary, dep_type_=dep_type)
            sentence_id = int(sentence['index'])
            if dep_tree is not None:
                dep_tree_dict[(abstract_id, sentence_id)] = dep_tree
        lc.update("Build Dep Tree Dict")
    print()
    print(f"Size of Dictionary: {len(dictionary)}")
Example #15
abstracts = set()
abstract_labels = dict()
for idx, row in enumerate(cursor):
    abstract_id = row[0]
    abstract_label = row[1]

    abstracts.add(abstract_id)
    abstract_labels[abstract_id] = abstract_label
connection.close()

corpus = corpora.TokenDocStream(abstracts=abstracts, token_type=token_type, print_status=True, output='all', lower=True)
row = []
col = []
data = []
lt = LoopTimer()

labels = list()

for idx, document in enumerate(corpus):
    words = document[1]
    abstract_id = document[0]

    label = abstract_labels[abstract_id]
    labels.append(label)

    bow = dictionary.doc2bow(words)
    vec_tfidf = tfidf[bow]

    for entry in vec_tfidf:
        row.append(idx)
Example #16
    vector_len += posbigram_vec_len


with open(target_path, 'rb') as target_file:
    label_dic = pickle.load(target_file)

label_count = dict()
for lkey in label_dic:
    label = label_dic[lkey]
    if label not in label_count:
        label_count[label] = 0

label_limit = limit / len(label_count)

last_abstract_id = None
lc = LoopTimer(update_after=5, avg_length=1000)
sent_infos = list()
max_sent = 0
breaker = 0

for abstract_id, row in df.iterrows():
    word_sentence_tokens = [sentence.split(" ") for sentence in row['word'].split("\t")]
    pos_sentence_tokens = [sentence.split(" ") for sentence in row['pos'].split("\t")]

    for sent_id, (word_tokens, pos_tokens) in enumerate(zip(word_sentence_tokens, pos_sentence_tokens)):

        if (last_abstract_id is not None) and (last_abstract_id != abstract_id):
            for feature_data in sent_infos:
                did = feature_data['id']
                sid = feature_data['sent_id']
                label_key = (did, sid)
Example #17
panda_path = "/media/norpheo/mySQL/db/ssorc/pandas"

ml_words = set()
ai_words = set()

ml_abstracts = dict()
ai_abstracts = dict()

count = 0
filename = f"all_ent_{count}.dict"
while os.path.isfile(os.path.join(panda_path, filename)):
    with open(os.path.join(panda_path, filename), "rb") as dict_file:
        dict_frame = pickle.load(dict_file)
    print(filename)
    lt = LoopTimer(update_after=50000,
                   avg_length=50000,
                   target=len(dict_frame))
    for abstract_id in dict_frame:
        entities = dict_frame[abstract_id]

        if 'machine learning' in entities:
            ml_words.update(set(entities))
            ml_abstracts[abstract_id] = entities

        if 'artificial intelligence' in entities:
            ai_words.update(set(entities))
            ai_abstracts[abstract_id] = entities

        lt.update("Parse Dict")

    print()
Example #18
                                            suffix='.pickle')

connection = mysql.connector.connect(
    host="localhost",
    user="******",
    passwd="thesis",
)
cursor = connection.cursor()
cursor.execute("USE ssorc;")

with open(path_to_feature_file, "rb") as feature_file:
    documents = pickle.load(feature_file)

pos = 0
neg = 0
lc = LoopTimer(update_after=50, avg_length=700, target=len(documents))
for abstract_id in documents:
    features = documents[abstract_id]
    prediction = mllr.predict_proba(features)

    if prediction[0][1] > 0.8:
        label = 1
        pos += 1
    else:
        label = -1
        neg += 1

    sql = f'UPDATE abstracts SET isML = {label} WHERE abstract_id = "{abstract_id}"'
    cursor.execute(sql)
    lc.update("Classify")
Example #19
path_to_mlgenome = os.path.join(path_to_db, "mlgenome", nlp_model)
path_to_mlgenome_features = os.path.join(path_to_mlgenome, "features")

if not os.path.isdir(path_to_mlgenome_features):
    print(f"Create Directory {path_to_mlgenome_features}")
    os.mkdir(path_to_mlgenome_features)

with open(os.path.join(path_to_mlgenome, "unique_mentions.pickle"),
          "rb") as handle:
    mentions = pickle.load(handle)

feature_vector = []

lt_target = len(mentions)
lt = LoopTimer(update_after=5000, avg_length=10000, target=lt_target)
for mention in mentions:
    m_string = mention["string"]
    m_is_acronym = mention['is_acronym']
    m_vec = mention['m_vec']

    #if m_is_acronym:
    #    continue

    feature_vector.append(m_vec)

    breaker = lt.update(f"Make Training-Set - {len(feature_vector)}")

print(len(feature_vector))

feature_dict = dict()
Example #20
          "icdm",
          "kdd",
          "uai",
          "cvpr",
          "iclr",
          "wsdm",
          "aistats"]

journal_dict = dict()
venue_dict = dict()

total_count = 0
ml_count = 0
ai_count = 0

lt = LoopTimer(update_after=50000, avg_length=500000, target=39*1000000+219709)
for filename in file_list[0:]:
    cur_path = os.path.join(raw_dir, filename)
    print()
    print(filename)
    dictFrame = dict()
    with open(cur_path) as file:
        for idx, file_line in enumerate(file):
            data = json.loads(file_line)

            if all(key in data for key in req_keys):
                entities = [entity.lower() for entity in data['entities']]

                journal = data['journalName'].lower()
                venue = data['venue'].lower()
Example #21
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))

abstract_id_list = list()
word_list = list()
lemma_list = list()
coarse_pos_list = list()
fine_pos_list = list()
ent_type_list = list()

merged_word_list = list()
merged_ent_type_list = list()

targ = len(infoDF)
not_found = 0
lt = LoopTimer(update_after=100, avg_length=10000, target=targ)
for abstract_id, row in infoDF.iterrows():
    file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy")
    if not os.path.isfile(file_path):
        not_found += 1
        lt.update(f"Create Pandas - {len(abstract_id_list)}")
        continue
    doc = Doc(vocab).from_disk(file_path)

    abstract_id_list.append(abstract_id)

    word_list.append("\t\t".join([
        "\t".join(
            [token.text for token in sentence if token_conditions(token)])
        for sentence in doc.sents
    ]))
Example #22
dbcon = DBConnector(db="ssorc")
path_to_db = "/media/norpheo/mySQL/db/ssorc"
path_to_annotations = os.path.join(path_to_db, "annotations")
path_to_raw = os.path.join(path_to_db, "raw")

connection = mysql.connector.connect(
            host="localhost",
            user="******",
            passwd="thesis",
        )

cursor = connection.cursor()
cursor.execute("USE ssorc;")
cursor.execute("SELECT abstract_id FROM abstracts WHERE annotated=0 and year>1990")

lc = LoopTimer(update_after=1000)
abstracts_to_process = set()
for idx, row in enumerate(cursor):
    abstracts_to_process.add(row[0])
    lc.update("Collect Abstracts to Process")
connection.close()
print()

print("There are " + str(len(abstracts_to_process)) + " files to process")

annotators = 'tokenize,ssplit,pos,lemma,depparse'
splitter_annotators = 'ssplit'
nlp = StanfordCoreNLP('../../../stanford-corenlp-full-2018-02-27')
props = {'annotators': annotators, 'pipelineLanguage': 'en', 'outputFormat': 'json'}
split_props = {'annotators': splitter_annotators, 'pipelineLanguage': 'en', 'outputFormat': 'json'}
Example #23
    feature_dict = pickle.load(feature_file)

all_features = feature_dict["features"]
all_targets = feature_dict["targets"]

print("Feature-Vector-Shape: " + str(all_features.shape))

learning_features, holdback_features, learning_targets, holdback_targets = train_test_split(
    all_features, all_targets, test_size=0.4, random_state=4, shuffle=True)
best_para = 0
best_score = 0

score_list = list()

print("Start Training:")
lc = LoopTimer(update_after=1, avg_length=5, target=len(reg_paras))
for c_para in reg_paras:
    #model = svm.SVC(decision_function_shape='ovo',
    #                C=c_para,
    #                kernel='rbf',
    #                gamma='auto')
    model = svm.SVC(kernel="linear", C=c_para, decision_function_shape='ovo')
    scores = cross_val_score(model,
                             learning_features,
                             learning_targets,
                             cv=10,
                             n_jobs=-1)
    mean_score = scores.mean()

    if mean_score > best_score:
        best_score = mean_score
Example #24
req_keys = ['title',
            'authors',
            'inCitations',
            'outCitations',
            'year',
            'paperAbstract',
            'id',
            'entities',
            'journalName',
            'venue']

journal_dict = dict()
venue_dict = dict()

lt = LoopTimer(update_after=50000, avg_length=500000, target=39*1000000+219709)
for filename in file_list[0:]:
    cur_path = os.path.join(raw_dir, filename)
    print()
    print(filename)
    dictFrame = dict()
    with open(cur_path) as file:
        for idx, file_line in enumerate(file):
            data = json.loads(file_line)

            if all(key in data for key in req_keys):
                entities = [entity.lower() for entity in data['entities']]

                journal = data['journalName'].lower()
                venue = data['venue'].lower()
Example #25
import os
import pickle
import mysql.connector

from src.utils.LoopTimer import LoopTimer

path_to_ner = "/media/norpheo/mySQL/db/ssorc/NER"
path_to_ml_algo_abstract_save = os.path.join(path_to_ner, "ml_algo_abstract_new.pickle")


with open(path_to_ml_algo_abstract_save, "rb") as algo_abstract_file:
    ml_algo_abstract = pickle.load(algo_abstract_file)

connection = mysql.connector.connect(
            host="localhost",
            user="******",
            passwd="thesis",
        )

cursor = connection.cursor()
cursor.execute("USE ssorc;")


lc = LoopTimer(update_after=100, avg_length=10000)
for idx, abstract_id in enumerate(ml_algo_abstract):
    sq1 = f'INSERT INTO mlabstracts (abstract_id) VALUES("{abstract_id}") ON DUPLICATE KEY UPDATE abstract_id = ("{abstract_id}")'
    cursor.execute(sq1)

    lc.update("Insert Into")

connection.commit()
connection.close()
Example #26
import os
import pickle

from src.utils.LoopTimer import LoopTimer

path_to_db = "/media/norpheo/mySQL/db/ssorc"
target_path = os.path.join(path_to_db, 'features',
                           'rf_targets_hl_sanity.pickle')
label_path = os.path.join(path_to_db, 'rf_hand_labels', 'sanity_data.csv')

targets = dict()
label_set = dict()
lc = LoopTimer(update_after=1000, avg_length=5000)
with open(label_path, 'r') as label_file:
    for line in label_file:
        info = line.replace('\n', '').split('\t')

        if len(info) != 4:
            print(len(info))
            print(line)
            continue

        abstract_id = info[0]
        sent_id = info[1]
        label = info[3]

        if label == '0':
            continue

        if label not in label_set:
            label_set[label] = 0
Example #27
with open(os.path.join(path_to_mlgenome, "svm_mlares.pickle"),
          "rb") as model_file:
    classifier = pickle.load(model_file)

with open(os.path.join(path_to_mlgenome, "mentions.pickle"), "rb") as handle:
    mentions = pickle.load(handle)

mlares_clf = classifier["model"]

lt_target = len(mentions) * len(mentions)
print(f"Mentions*Mentions: {round(lt_target/1000000000, 3)} Billion")

entity_linking = dict()

breaker = 0
lt = LoopTimer(update_after=5000, avg_length=10000, target=lt_target)
for mention in mentions:
    m_string = mention["mention_string"]
    m_tokens = mention["mention_tokens"]
    m_dvec = mention["doc_vector"]
    m_svec = mention["sentence_vector"]
    m_mvec = mention["mention_vector"]

    for candidate in mentions:
        c_string = candidate["mention_string"]
        c_tokens = candidate["mention_tokens"]
        c_dvec = candidate["doc_vector"]
        c_svec = candidate["sentence_vector"]
        c_mvec = candidate["mention_vector"]

        similarity = mlares.similarity_ratio(m_string, c_string)
Example #28
ent_counter = dict()

#forbidden_dep = ['csubj', 'nummod', 'cc', 'advmod', 'preconj', 'attr', 'det']
#forbidden_pos = ['VERB', 'ADP']

forbidden_dep = ['det', 'predet', 'nummod', 'cc', 'appos', 'punct', 'conj']
forbidden_pos = ['ADP', 'VERB', 'X', 'ADV']

forbidden_substrings = [
    'state-of-the-art', ',', '(', ')', "approaches", "approach", "algorithm",
    "algorithms", "based", "function", "functions", "other", "large", "larger",
    "twitter", "such"
]

collect_ml = set()
lt = LoopTimer(update_after=200, avg_length=2000, target=targ)
for abstract_id, row in infoDF.iterrows():
    ori_doc = Doc(vocab).from_disk(
        os.path.join(path_to_annotations, f"{abstract_id}.spacy"))

    for sent in ori_doc.sents:
        sentence = sent.as_doc()

        matches = matcher(sentence)

        ent_list = list()

        for match in matches:
            start = match[1]
            end = match[2]
Example #29
track_topics = {"SVM": [166],
                "NeuralNetwork": [50, 114, 140]
                }

tm_model = TopicModelingLDA(info_fn=tm_info_file_name)

wordDF = pd.read_pickle(os.path.join(path_to_db, "pandas", "aiml_ner_merged_word.pandas"))
infoDF = pd.read_pickle(os.path.join(path_to_db, "pandas", "ner_info_db.pandas"))
df = infoDF.join(wordDF)

timeseries = dict()
for topic in track_topics:
    timeseries[topic] = dict()
year_count = dict()
lc = LoopTimer(update_after=100, avg_length=5000, target=len(df))
for abstract_id, row in df.iterrows():

    year = row['year']
    text = row['merged_word'].replace("\t\t", "\t")
    tokens = row['merged_word'].split("\t")

    topic_dist = tm_model.get_topic_dist(tokens)

    top_n_topics = topic_dist.argsort()[::-1][:5]

    if year not in year_count:
        year_count[year] = 0
        for topic in track_topics:
            timeseries[topic][year] = 0
    year_count[year] += 1
Example #30
        dependency = data['dependency']

        rule = (trigger_word, dependency)

        if category not in rules:
            rules[category] = set()

        rules[category].add(rule)

print("Loading Vocab...")
vocab = Vocab().from_disk(os.path.join(path_to_annotations, "spacy.vocab"))
infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas'))

db_size = len(infoDF)

lc = LoopTimer(update_after=100, avg_length=200, target=db_size)

predictions = dict()
targets = dict()

target_vector = list()
feature_vector = list()

for idx, (abstract_id, df_row) in enumerate(infoDF.iterrows()):

    file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy")
    doc = Doc(vocab).from_disk(file_path)

    for n_sents, sent in enumerate(doc.sents):
        pass