Example #1
# Standard imports plus project-local helpers; the module paths for
# QuickDataFrame, Progresser and tokenise are assumed here.
import pickle
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from quick_data_frame import QuickDataFrame  # project-local helper (assumed path)
from progresser import Progresser  # project-local helper (assumed path)
from tokenizer import tokenise  # project-local helper (assumed path)


def create_sentence_files():
    stop_words = set(
        pd.read_csv('./Primary_data/PersianStopWordList.txt', header=None)[0])
    questions = QuickDataFrame.read_csv('./Primary_data/result_filtered.csv',
                                        sep=';')
    topics = QuickDataFrame.read_csv('./Primary_data/topic_vector_Q.csv')

    files = dict()
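    # open one positive (.p) and one negative (.n) sentence file per topic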
    for tpc in topics.cols:
        files[tpc + '-p'] = open('./Primary_data/sent_topic/' + tpc + '.p',
                                 'w',
                                 encoding='utf-8')
        files[tpc + '-n'] = open('./Primary_data/sent_topic/' + tpc + '.n',
                                 'w',
                                 encoding='utf-8')

    prog = Progresser(len(questions['sentence']))
    # build the train data
    for i, qrow in enumerate(questions['sentence']):
        prog.count()
        snt = []
        for word in tokenise(qrow):
            if word not in stop_words:
                snt.append(word)
        snt = ' '.join(snt)
        for tpc in topics.cols:
            if topics[tpc][i] == '0':
                files[tpc + '-n'].write(snt + '\n')
            elif topics[tpc][i] == '1':
                files[tpc + '-p'].write(snt + '\n')
            else:
                print('unexpected topic value:', topics[tpc][i])

    for fl in files.values():
        fl.close()


def find_frequent_words():
    id_mappings = QuickDataFrame.read_csv(
        './EurLex_data/eurlex_ID_mappings.csv', sep='\t')
    words = dict()

    prog = Progresser(len(id_mappings))
    for i in range(len(id_mappings)):
        prog.count()
        try:
            # read the file
            try:
                with open('./EurLex_data/lem_txt/' +
                          str(id_mappings['DocID'][i]) + '-lem.txt',
                          'r',
                          encoding="utf8") as infile:
                    doc_text = infile.read()
            except IOError as e:
                # print(e)
                continue
            # count the words
            for word in word_tokenize(doc_text):
                if word in words:
                    words[word] += 1
                else:
                    words[word] = 1
        except Exception as e:
            print(e)

    # keep only year tokens (19xx or 2xxx) and words that contain no digits
    cleaner = re.compile(r'^(19\d\d)$|^(2\d\d\d)$|^((?!\d)\w)*$')
    filtered_words = dict()
    for word, freq in words.items():
        if cleaner.match(word):
            filtered_words[word] = freq

    sorted_words = sorted(filtered_words,
                          key=lambda x: filtered_words[x],
                          reverse=True)

    with open('./EurLex_data/words_frequency.csv', 'w',
              encoding="utf8") as outfile:
        for word in sorted_words:
            try:
                outfile.write(str(word) + ',' + str(words[word]) + '\n')
            except Exception as e:
                print(word, e)

    with open('./EurLex_data/1000words.csv', 'w', encoding="utf8") as outfile:
        for word in sorted_words[:1000]:
            outfile.write(str(word) + '\n')


def load_data(n_x=1000, n_y=160):
    data = QuickDataFrame.read_csv('./EurLex_data/eurlex_combined_vectors.csv')
    q_vector_length = 1000
    data.delete_column('doc_id')
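    # the first n_x columns are word features (X); the columns after
    # q_vector_length are the topic label columns (Y)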
    x_list = [data[col] for col in data.cols[:n_x]]
    x_array = np.array(x_list, dtype=int).transpose()

    y_list = [data[col]
              for col in data.cols[q_vector_length:q_vector_length + n_y]]
    y_array = np.array(y_list, dtype=int).transpose()
    print('loaded data.')
    return x_array, y_array


def build_w2v_vectors():
    with open('./word2vec/word2vec-En.pkl', 'rb') as infile:
        w2v = pickle.load(infile)

    w2v_length = 300
    stop_words = set(stopwords.words('english'))

    id_mappings = QuickDataFrame.read_csv(
        './EurLex_data/eurlex_ID_mappings.csv', sep='\t')

    # create DataFrame
    cols_list = ['doc_id'] + ['w' + str(i) for i in range(0, w2v_length)]
    train = QuickDataFrame(columns=cols_list)

    prog = Progresser(len(id_mappings))
    for i in range(len(id_mappings)):
        prog.count()
        # read the file
        try:
            with open('./EurLex_data/lem_txt/' + str(id_mappings['DocID'][i]) +
                      '-lem.txt',
                      'r',
                      encoding="utf8") as infile:
                doc_text = infile.read()
        except IOError:
            continue
        try:
            sum_array = np.zeros(w2v_length)
            number_of_words = 0

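            # sum the embeddings of known, non-stop-word tokens, then average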
            for word in word_tokenize(doc_text):
                if word not in stop_words and word in w2v:
                    number_of_words += 1
                    sum_array += w2v[word]
            if number_of_words > 0:
                sum_array = sum_array / number_of_words

            train.append([id_mappings['DocID'][i]] + list(sum_array))

        except Exception as e:
            print(e)

    train.to_csv('./EurLex_data/w2v_vector_Q.csv')


def find_all_tags():
    subject_data = QuickDataFrame.read_csv(
        './EurLex_data/eurlex_id2class/id2class_eurlex_subject_matter.qrels',
        header=False,
        columns=['sub', 'doc_id', 'col2'],
        sep=' ')
    subjects = dict()
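    # count how many documents carry each subject tag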
    for i in range(len(subject_data)):
        sub = subject_data['sub'][i]
        if sub not in subjects:
            subjects[sub] = 1
        else:
            subjects[sub] += 1

    sorted_tags = sorted(subjects, key=lambda x: subjects[x], reverse=True)
    with open('./EurLex_data/tags.csv', 'w') as outfile:
        outfile.write('term,freq\n')
        for tag in sorted_tags:
            outfile.write(tag + ',' + str(subjects[tag]) + '\n')


def build_all_vectors():
    id_mappings = QuickDataFrame.read_csv(
        './EurLex_data/eurlex_ID_mappings.csv', sep='\t')
    subject_data = QuickDataFrame.read_csv(
        './EurLex_data/eurlex_id2class/id2class_eurlex_subject_matter.qrels',
        header=False,
        columns=['sub', 'doc_id', 'col2'],
        sep=' ')
    words_vector = QuickDataFrame.read_csv('./EurLex_data/1000words.csv',
                                           header=False,
                                           columns=['term'])
    topics = QuickDataFrame.read_csv('./EurLex_data/tags.csv')

    # train = QuickDataFrame.read_csv('./EurLex_data/w2v_vector_Q.csv')
    # train.set_index(train['doc_id'], unique=True)

    # create DataFrame
    cols_list = ['doc_id'] + list(words_vector['term'])
    train = QuickDataFrame(columns=cols_list)

    # filling word columns
    prog = Progresser(len(id_mappings))
    for i in range(len(id_mappings)):
        prog.count()
        try:
            # read the file
            try:
                with open('./EurLex_data/lem_txt/' +
                          str(id_mappings['DocID'][i]) + '-lem.txt',
                          'r',
                          encoding="utf8") as infile:
                    doc_text = infile.read()
            except IOError:
                continue

            # add a new row
            train.append(value=0)

            # complete the data in that row
            train['doc_id'][len(train) - 1] = id_mappings['DocID'][i]
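            # set a 1 for every vocabulary word that occurs in this document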
            for word in word_tokenize(doc_text):
                if word in train.data:
                    train[word][len(train) - 1] = 1
        except Exception as e:
            print(e)

    # index by doc id
    train.set_index(train['doc_id'], unique=True)

    # rename word columns
    rename_dict = dict()
    index = 0
    for wrd in list(words_vector['term']):
        rename_dict[wrd] = 'wrd' + str(index)
        index += 1
    train.rename(columns=rename_dict)

    # add topic columns
    for col in list(topics['term']):
        train.add_column(name=col, value=0)

    # filling topic columns
    for i in range(len(subject_data)):
        try:
            sub = subject_data['sub'][i]
            doc_id = subject_data['doc_id'][i]
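            # mark this document's row (indexed by doc_id) under its subject column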
            train[sub, doc_id] = 1
        except Exception as e:
            print(e)

    # rename topic columns
    rename_dict = dict()
    index = 0
    for tpc in list(topics['term']):
        rename_dict[tpc] = 'tpc' + str(index)
        index += 1
    train.rename(columns=rename_dict)

    # write to file
    print('\nWriting to file...')
    # train.to_csv('./EurLex_data/eurlex_combined_vectors.csv')
    train.to_csv('./EurLex_data/eurlex_combined_vectors-w2v.csv')