Example #1
    def __iter__(self):
        # Stream POS bigrams sentence by sentence; note that only the
        # first annotation file is processed.
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)

                last_id = None
                for line in file:
                    if self.print_status:
                        lc.update("Posbigram Sent Stream")

                    data = json.loads(line)

                    xml = data['annotation']
                    doc_id = data['id']  # renamed to avoid shadowing the built-in id()
                    # Restart the paragraph counter whenever a new document begins.
                    if last_id != doc_id:
                        para_num = 0
                    else:
                        para_num += 1
                    last_id = doc_id

                    token_list = mf.xml2words(xml)
                    pos_list = mf.xml2pos(xml)

                    for tokens, pos_tags in zip(token_list, pos_list):
                        token_cleaned, pos_cleaned = utils.posFilterString(tokens, pos_tags)

                        if len(token_cleaned) > 0:
                            yield doc_id, para_num, utils.makeBigrams(pos_cleaned)
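
Every example relies on utils.makeBigrams, whose definition is not shown on this page. From its call sites it maps a token list to the list of adjacent pairs; a minimal sketch, assuming each pair is joined into a single space-separated string (the real helper may format pairs differently):

def makeBigrams(tokens):
    # Hypothetical sketch: pair each token with its successor.
    return [f"{a} {b}" for a, b in zip(tokens, tokens[1:])]

For example, makeBigrams(['DT', 'NN', 'VBZ']) would return ['DT NN', 'NN VBZ'].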
Example #2
def nlp_to_doc_token(annotation, token_type, clean=True, lower=False, bigrams=False, dictionary=None):
    """Flatten an annotation into one token list covering the whole document."""
    sentences = annotation['sentences']
    abs_list = []

    for sentence in sentences:
        pos_list = []
        token_list = []

        for token in sentence['tokens']:
            pos_list.append(token['pos'])
            if lower:
                token_list.append(token[token_type].lower())
            else:
                token_list.append(token[token_type])

        if clean:
            # Keep only tokens whose POS tag passes the filter.
            token_list, _ = utils.posFilterString(token_list, pos_list)

        if dictionary is not None:
            # Drop tokens missing from the gensim dictionary's vocabulary.
            token_list = [word for word in token_list if word in dictionary.token2id]

        if bigrams:
            token_list = utils.makeBigrams(token_list)

        abs_list.extend(token_list)

    return abs_list
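
A short usage sketch for nlp_to_doc_token; the annotation layout (a CoreNLP-style dict of sentences, each with 'tokens' carrying 'word' and 'pos' keys) and the sample values are assumptions:

annotation = {
    'sentences': [
        {'tokens': [
            {'word': 'Neural', 'pos': 'JJ'},
            {'word': 'networks', 'pos': 'NNS'},
        ]},
    ]
}

# Lowercased word tokens, with POS filtering and dictionary lookup disabled
# so the call needs no external helpers.
tokens = nlp_to_doc_token(annotation, token_type='word', clean=False, lower=True)
print(tokens)  # ['neural', 'networks']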
Example #3
    def __iter__(self):
        # Stream one POS-bigram document per document id; again only the
        # first annotation file is processed.
        for filename in self.file_list[0:1]:
            sent_file = os.path.join(self.annotation_dir, filename)
            with open(sent_file) as file:
                lc = LoopTimer(update_after=100)
                abs_list = []

                last_id = None
                for line in file:
                    if self.print_status:
                        lc.update("Posbigram Doc Stream")

                    data = json.loads(line)

                    doc_id = data['id']
                    xml = data['annotation']

                    # A new document id closes out the previous document.
                    if last_id != doc_id and len(abs_list) > 0:
                        yield last_id, abs_list
                        abs_list = []

                    last_id = doc_id
                    token_list = mf.xml2words(xml)
                    pos_list = mf.xml2pos(xml)

                    for tokens, pos_tags in zip(token_list, pos_list):
                        token_cleaned, pos_cleaned = utils.posFilterString(tokens, pos_tags)

                        if len(pos_cleaned) > 0:
                            abs_list.extend(utils.makeBigrams(pos_cleaned))

                # Flush the last buffered document; last_id is always set
                # when abs_list is non-empty, even for an empty file.
                if len(abs_list) > 0:
                    yield last_id, abs_list
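
utils.posFilterString is also external. Judging by its call sites, it takes parallel token and POS lists and returns filtered copies of both; a sketch assuming a simple POS whitelist (the tag set shown is hypothetical):

ALLOWED_POS = {'NN', 'NNS', 'NNP', 'JJ', 'VB', 'VBZ'}  # hypothetical whitelist

def posFilterString(tokens, pos_tags):
    # Keep only (token, tag) pairs whose tag passes the whitelist,
    # preserving the parallel structure of the two lists.
    kept = [(t, p) for t, p in zip(tokens, pos_tags) if p in ALLOWED_POS]
    if not kept:
        return [], []
    token_cleaned, pos_cleaned = map(list, zip(*kept))
    return token_cleaned, pos_cleaned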
Example #4
    def get_feature_vector(self, word_corpus, pos_corpus):

        # First pass: vectorize each sentence and stash per-sentence features.
        sent_infos = list()
        max_sent = 0
        sent_id = 0

        for words, pos in zip(word_corpus, pos_corpus):
            sent_id += 1   # 1-based sentence index
            max_sent += 1  # ends up as the total number of sentences

            wordbigram = makeBigrams(words)
            posbigram = makeBigrams(pos)

            word_bow = self.word_dic.doc2bow(words)
            vec_word_tfidf = self.word_tfidf[word_bow]
            wordbigram_bow = self.wordbigram_dic.doc2bow(wordbigram)
            vec_wordbigram_tfidf = self.wordbigram_tfidf[wordbigram_bow]

            pos_bow = self.pos_dic.doc2bow(pos)
            vec_pos_tfidf = self.pos_tfidf[pos_bow]

            posbigram_bow = self.posbigram_dic.doc2bow(posbigram)
            vec_posbigram_tfidf = self.posbigram_tfidf[posbigram_bow]

            # Collect concreteness ratings: min, max, and mean over the
            # words of the sentence that have a rating.
            cr_min = 1000  # sentinel above any real rating
            cr_max = 0
            cr_mean = 0

            cr_words = [
                cr_word for cr_word in words if cr_word in self.conc_rating
            ]

            for word in cr_words:
                rating = self.conc_rating[word]
                cr_mean += rating
                cr_max = max(cr_max, rating)
                cr_min = min(cr_min, rating)

            if cr_min > cr_max:
                # No rated word in this sentence: fall back to zeros.
                cr_max_feature = 0
                cr_min_feature = 0
                cr_mean_feature = 0
            else:
                cr_mean = cr_mean / len(cr_words)
                cr_max_feature = cr_max
                cr_min_feature = cr_min
                cr_mean_feature = cr_mean

            sent_info = dict()
            sent_info['cr_max_feature'] = cr_max_feature
            sent_info['cr_min_feature'] = cr_min_feature
            sent_info['cr_mean_feature'] = cr_mean_feature
            sent_info['vec_word_tfidf'] = vec_word_tfidf
            sent_info['vec_wordbigram_tfidf'] = vec_wordbigram_tfidf
            sent_info['vec_pos_tfidf'] = vec_pos_tfidf
            sent_info['vec_posbigram_tfidf'] = vec_posbigram_tfidf
            sent_info['sent_id'] = sent_id
            sent_infos.append(sent_info)

        # Second pass: assemble COO-style triplets (row, col, value) for the
        # sparse feature matrix.
        feature_data_array = []
        feature_row = []
        feature_col = []

        row_count = 0

        for feature_data in sent_infos:
            sid = feature_data['sent_id']

            # Running column offset; each enabled feature group claims a slice.
            vector_offset = 0

            if 'location' in self.feature_set:
                # Relative position of the sentence within the abstract.
                feature_row.append(row_count)
                feature_col.append(vector_offset)
                feature_data_array.append(sid / max_sent)

                vector_offset += 1

            if 'concreteness' in self.feature_set:
                cr_max_feature = float(feature_data['cr_max_feature'])
                cr_min_feature = float(feature_data['cr_min_feature'])
                cr_mean_feature = float(feature_data['cr_mean_feature'])

                feature_row.append(row_count)
                feature_col.append(vector_offset)
                feature_data_array.append(cr_max_feature)

                feature_row.append(row_count)
                feature_col.append(vector_offset + 1)
                feature_data_array.append(cr_min_feature)

                feature_row.append(row_count)
                feature_col.append(vector_offset + 2)
                feature_data_array.append(cr_mean_feature)
                vector_offset += 3

            if 'wordunigram' in self.feature_set:
                append_vec2data(feature_data['vec_word_tfidf'],
                                feature_data_array, feature_row, feature_col,
                                row_count, vector_offset)
                vector_offset += self.word_vec_len

            if 'wordbigram' in self.feature_set:
                append_vec2data(feature_data['vec_wordbigram_tfidf'],
                                feature_data_array, feature_row, feature_col,
                                row_count, vector_offset)
                vector_offset += self.wordbigram_vec_len

            if 'posunigram' in self.feature_set:
                append_vec2data(feature_data['vec_pos_tfidf'],
                                feature_data_array, feature_row, feature_col,
                                row_count, vector_offset)
                vector_offset += self.pos_vec_len

            if 'posbigram' in self.feature_set:
                append_vec2data(feature_data['vec_posbigram_tfidf'],
                                feature_data_array, feature_row, feature_col,
                                row_count, vector_offset)
                vector_offset += self.posbigram_vec_len

            row_count += 1

        feature_row = np.array(feature_row)
        feature_col = np.array(feature_col)
        feature_data_array = np.array(feature_data_array)

        feature_vector = scipy.sparse.csc_matrix(
            (feature_data_array, (feature_row, feature_col)),
            shape=(row_count, self.vector_len))

        return feature_vector
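
append_vec2data is used above but not defined on this page. Its call sites suggest it scatters a gensim sparse vector (a list of (term_id, weight) pairs) into the COO triplet lists at a given column offset; a sketch under that assumption:

def append_vec2data(vec, feature_data_array, feature_row, feature_col,
                    row_count, vector_offset):
    # vec is a gensim-style sparse vector: [(term_id, weight), ...].
    # Each entry lands in row row_count, column vector_offset + term_id.
    for term_id, weight in vec:
        feature_row.append(row_count)
        feature_col.append(vector_offset + term_id)
        feature_data_array.append(weight)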
Example #5
df = wordDF.join(lemmaDF).join(fineposDF).join(coarseposDF).join(
    mergedwordDF)  # .join(wordlowermergedDF)

for token_type, dic_path in zip(token_types, dic_paths):
    # A "...bigram" token type reuses the unigram column and builds the
    # bigrams on the fly.
    is_bigram = False
    if "bigram" in token_type:
        is_bigram = True
        token_type = token_type[:-6]  # strip the "bigram" suffix

    corpus = []

    print(f"Build Corpus for {token_type} - Bigram: {is_bigram}")
    for abstract_id, row in df.iterrows():
        token_string = row[token_type]
        # Tokens are tab-separated; collapse accidental double tabs first.
        tokens = token_string.replace("\t\t", "\t").split("\t")
        if is_bigram:
            tokens = makeBigrams(tokens)

        corpus.append(tokens)

    print("Build Dictionary")
    dictionary = gensim.corpora.Dictionary()
    dictionary.add_documents(corpus, prune_at=None)

    print("Save Dictionary")
    dictionary.save(dic_path)
    print(dictionary)
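
Once saved, the dictionary maps token lists to bag-of-words vectors via the standard gensim API; a minimal, self-contained sketch (the sample tokens are made up):

import gensim

dictionary = gensim.corpora.Dictionary([['neural', 'networks', 'learn']])
bow = dictionary.doc2bow(['neural', 'networks', 'neural'])
print(bow)  # [(token_id, count), ...] - e.g. 'neural' appears with count 2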
Example #6
                                          feature_col,
                                          row_count,
                                          vector_offset)
                    vector_offset += posbigram_vec_len

                row_count += 1

            max_sent = 0
            sent_infos.clear()

        last_abstract_id = abstract_id
        max_sent += 1
        label_key = (abstract_id, sent_id)

        if (label_key in label_dic) and (label_count[label_dic[label_key]] < label_limit):
            wordbigram = utils.makeBigrams(word_tokens)
            posbigram = utils.makeBigrams(pos_tokens)

            word_bow = word_dic.doc2bow(word_tokens)
            vec_word_tfidf = word_tfidf[word_bow]
            wordbigram_bow = wordbigram_dic.doc2bow(wordbigram)
            vec_wordbigram_tfidf = wordbigram_tfidf[wordbigram_bow]

            pos_bow = pos_dic.doc2bow(pos_tokens)
            vec_pos_tfidf = pos_tfidf[pos_bow]

            posbigram_bow = posbigram_dic.doc2bow(posbigram)
            vec_posbigram_tfidf = posbigram_tfidf[posbigram_bow]

            # Collect concreteness ratings (min/max/mean), as in the examples above.
            cr_min = 1000  # sentinel above any real rating
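
load_concratings (used in the next example) and the conc_rating lookup map a word to a numeric concreteness rating. A hedged loading sketch, assuming a two-column tab-separated ratings file; the path and format are hypothetical:

import csv

def load_concratings(path='concreteness.tsv'):  # hypothetical path and format
    # Assumed layout: one word and one numeric rating per line.
    ratings = {}
    with open(path) as f:
        for word, value in csv.reader(f, delimiter='\t'):
            ratings[word] = float(value)
    return ratings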
Example #7
def build_feature_file(dtype):
    dictionary_dir = os.path.join(
        dirname, '../../data/processed/' + dtype + '/dictionaries')
    tfidf_dir = os.path.join(dirname,
                             '../../data/processed/' + dtype + '/tfidf')
    feature_file = os.path.join(
        dirname,
        '../../data/processed/' + dtype + '/features/ap_features.json')

    if os.path.isfile(feature_file):
        os.remove(feature_file)  # start from a fresh feature file

    word_dic = gensim.corpora.Dictionary.load(
        os.path.join(dictionary_dir, 'word.dic'))
    wordbigram_dic = gensim.corpora.Dictionary.load(
        os.path.join(dictionary_dir, 'wordbigram.dic'))
    pos_dic = gensim.corpora.Dictionary.load(
        os.path.join(dictionary_dir, 'pos.dic'))
    posbigram_dic = gensim.corpora.Dictionary.load(
        os.path.join(dictionary_dir, 'posbigram.dic'))

    word_tfidf = gensim.models.TfidfModel.load(
        os.path.join(tfidf_dir, 'words_model.tfidf'))
    wordbigram_tfidf = gensim.models.TfidfModel.load(
        os.path.join(tfidf_dir, 'wordbigrams_model.tfidf'))
    pos_tfidf = gensim.models.TfidfModel.load(
        os.path.join(tfidf_dir, 'pos_model.tfidf'))
    posbigram_tfidf = gensim.models.TfidfModel.load(
        os.path.join(tfidf_dir, 'posbigrams_model.tfidf'))

    conc_rating = load_concratings()

    word_corpus = corpora.word_sent_stream(dtype)
    pos_corpus = corpora.pos_sent_stream(dtype)

    with open(feature_file, "a") as featfile:
        # The first line is a header with the vector lengths needed to
        # interpret the per-sentence feature vectors that follow.
        information = {}
        information['word_vec_len'] = len(word_dic)
        information['wordbigram_vec_len'] = len(wordbigram_dic)
        information['pos_vec_len'] = len(pos_dic)
        information['posbigram_vec_len'] = len(posbigram_dic)

        featfile.write(json.dumps(information) + '\n')

        sent_infos = []
        last_doc_id = None
        lt = LoopTimer(update_after=100)

        for word_sent, pos_sent in zip(word_corpus, pos_corpus):
            if word_sent[0] != pos_sent[0]:
                # The two streams should be aligned; skip mismatched ids.
                continue

            doc_id = word_sent[0]
            pid = word_sent[1]

            words = word_sent[2]
            pos = pos_sent[2]
            wordbigrams = utils.makeBigrams(words)
            posbigrams = utils.makeBigrams(pos)

            word_bow = word_dic.doc2bow(words)
            vec_word_tfidf = word_tfidf[word_bow]

            wordbigram_bow = wordbigram_dic.doc2bow(wordbigrams)
            vec_wordbigram_tfidf = wordbigram_tfidf[wordbigram_bow]

            pos_bow = pos_dic.doc2bow(pos)
            vec_pos_tfidf = pos_tfidf[pos_bow]

            posbigram_bow = posbigram_dic.doc2bow(posbigrams)
            vec_posbigram_tfidf = posbigram_tfidf[posbigram_bow]

            # Collect concreteness ratings: min, max, and mean over rated words.
            cr_min = 1000  # sentinel above any real rating
            cr_max = 0
            cr_mean = 0

            cr_words = [cr_word for cr_word in words if cr_word in conc_rating]

            for word in cr_words:
                rating = conc_rating[word]
                cr_mean += rating
                cr_max = max(cr_max, rating)
                cr_min = min(cr_min, rating)

            if cr_min > cr_max:
                # No rated word in this sentence: fall back to zeros.
                cr_max_feature = 0
                cr_min_feature = 0
                cr_mean_feature = 0
            else:
                cr_mean = cr_mean / len(cr_words)
                cr_max_feature = cr_max
                cr_min_feature = cr_min
                cr_mean_feature = cr_mean

            if (last_doc_id is not None) and (last_doc_id != doc_id):
                # New document id: flush all buffered sentences of the previous
                # document, now that its sentence count is known.
                max_sent = len(sent_infos)
                for sent_info in sent_infos:
                    sent_info['max_sent'] = max_sent
                    featfile.write(json.dumps(sent_info) + '\n')
                sent_infos.clear()

            sent_info = dict()
            sent_info['cr_max_feature'] = cr_max_feature
            sent_info['cr_min_feature'] = cr_min_feature
            sent_info['cr_mean_feature'] = cr_mean_feature
            sent_info['vec_word_tfidf'] = vec_word_tfidf
            sent_info['vec_wordbigram_tfidf'] = vec_wordbigram_tfidf
            sent_info['vec_pos_tfidf'] = vec_pos_tfidf
            sent_info['vec_posbigram_tfidf'] = vec_posbigram_tfidf
            sent_info['id'] = doc_id
            sent_info['paragraphID'] = pid
            sent_info['sent_id'] = len(sent_infos)
            sent_infos.append(sent_info)

            last_doc_id = doc_id

            lt.update("Build AP Features")

        # Flush the final document.
        max_sent = len(sent_infos)
        for sent_info in sent_infos:
            sent_info['max_sent'] = max_sent
            featfile.write(json.dumps(sent_info) + '\n')
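
The resulting file is newline-delimited JSON: one header object with the four *_vec_len entries, then one object per sentence. A minimal reader sketch (the filename is shortened for illustration):

import json

with open('ap_features.json') as featfile:
    info = json.loads(featfile.readline())  # header: vector lengths
    for line in featfile:
        sent_info = json.loads(line)
        # e.g. sent_info['id'], sent_info['paragraphID'], sent_info['sent_id'],
        # sent_info['max_sent'], and the tf-idf vectors as [term_id, weight] pairs.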