import pickle
import re

import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords

# QuickDataFrame, Progresser and tokenise (a Persian tokenizer) are project-local
# helpers; their import paths are not shown in this file.


def create_sentence_files():
    stop_words = set(pd.read_csv('./Primary_data/PersianStopWordList.txt', header=None)[0])
    questions = QuickDataFrame.read_csv('./Primary_data/result_filtered.csv', sep=';')
    topics = QuickDataFrame.read_csv('./Primary_data/topic_vector_Q.csv')

    # one positive and one negative sentence file per topic
    files = dict()
    for tpc in topics.cols:
        files[tpc + '-p'] = open('./Primary_data/sent_topic/' + tpc + '.p', 'w', encoding='utf-8')
        files[tpc + '-n'] = open('./Primary_data/sent_topic/' + tpc + '.n', 'w', encoding='utf-8')

    prog = Progresser(len(questions['sentence']))
    # build the train data
    for i, qrow in enumerate(questions['sentence']):
        prog.count()

        # drop stop words and rejoin the remaining tokens
        snt = []
        for word in tokenise(qrow):
            if word not in stop_words:
                snt.append(word)
        snt = ' '.join(snt)

        # write the sentence to the negative or positive file of each topic
        for tpc in topics.cols:
            if topics[tpc][i] == '0':
                files[tpc + '-n'].write(snt + '\n')
            elif topics[tpc][i] == '1':
                files[tpc + '-p'].write(snt + '\n')
            else:
                print('Unexpected topic label:', topics[tpc][i])

    for fl in files.values():
        fl.close()
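# `tokenise` used above is assumed to be the project's Persian tokenizer (e.g. something
# like hazm's word_tokenize); it is not defined in this file. The sketch below is only a
# hypothetical minimal stand-in that splits text into Unicode word tokens, to show the
# expected input/output shape.
def _tokenise_sketch(text):
    # \w matches Persian letters as well under Python 3's default Unicode matching
    return re.findall(r'\w+', text)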
def find_frequent_words():
    id_mappings = QuickDataFrame.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')

    words = dict()
    prog = Progresser(len(id_mappings))
    for i in range(len(id_mappings)):
        prog.count()
        try:
            # read the lemmatised document text
            try:
                with open('./EurLex_data/lem_txt/' + str(id_mappings['DocID'][i]) + '-lem.txt',
                          'r', encoding='utf8') as infile:
                    doc_text = infile.read()
            except IOError:
                continue

            # count the words
            for word in word_tokenize(doc_text):
                if word in words:
                    words[word] += 1
                else:
                    words[word] = 1
        except Exception as e:
            print(e)

    # keep only years (19xx, 2xxx) and tokens made entirely of non-digit word characters
    cleaner = re.compile(r'^(19\d\d)$|^(2\d\d\d)$|^((?!\d)\w)*$')
    filtered_words = dict()
    for word, freq in words.items():
        if cleaner.match(word):
            filtered_words[word] = freq

    # sort by frequency and write the full list plus the top 1000 words
    sorted_words = sorted(filtered_words, key=lambda x: filtered_words[x], reverse=True)
    with open('./EurLex_data/words_frequency.csv', 'w', encoding='utf8') as outfile:
        for word in sorted_words:
            try:
                outfile.write(str(word) + ',' + str(words[word]) + '\n')
            except Exception as e:
                print(word, e)

    with open('./EurLex_data/1000words.csv', 'w', encoding='utf8') as outfile:
        for word in sorted_words[:1000]:
            outfile.write(str(word) + '\n')
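# Small illustration (not part of the original pipeline) of what the word filter in
# find_frequent_words() keeps: four-digit years starting with 19 or 2, and tokens made
# entirely of non-digit word characters; mixed alphanumerics and other numbers are dropped.
def _demo_word_filter():
    cleaner = re.compile(r'^(19\d\d)$|^(2\d\d\d)$|^((?!\d)\w)*$')
    samples = ['regulation', '1999', '2003', 'abc123', '42']
    # keeps 'regulation', '1999' and '2003'; drops 'abc123' and '42'
    return [w for w in samples if cleaner.match(w)]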
def load_data(n_x=1000, n_y=160):
    data = QuickDataFrame.read_csv('./EurLex_data/eurlex_combined_vectors.csv')
    q_vector_length = 1000
    data.delete_column('doc_id')

    # the first n_x word columns are the features
    x_list = [data[col] for col in data.cols[:n_x]]
    x_array = np.array(x_list, dtype=int).transpose()

    # the first n_y topic columns after the word columns are the labels
    y_list = [data[col] for col in data.cols[q_vector_length:q_vector_length + n_y]]
    y_array = np.array(y_list, dtype=int).transpose()

    print('loaded data.')
    return x_array, y_array
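# Minimal usage sketch for load_data() (assumption: scikit-learn is available; the actual
# training code is not part of this file). It shows how the word-vector features and topic
# labels could feed a one-vs-rest multi-label classifier.
def _train_baseline_sketch():
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.multiclass import OneVsRestClassifier

    x_array, y_array = load_data(n_x=1000, n_y=160)
    x_train, x_test, y_train, y_test = train_test_split(x_array, y_array, test_size=0.2, random_state=0)

    # note: labels that end up with a single class in y_train would need to be dropped first
    clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
    clf.fit(x_train, y_train)
    print('subset accuracy:', clf.score(x_test, y_test))
    return clf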
def build_w2v_vectors():
    with open('./word2vec/word2vec-En.pkl', 'rb') as infile:
        w2v = pickle.load(infile)
    w2v_length = 300

    stop_words = set(stopwords.words('english'))

    id_mappings = QuickDataFrame.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')

    # create DataFrame: one doc_id column plus 300 word2vec dimensions
    cols_list = ['doc_id'] + ['w' + str(i) for i in range(0, w2v_length)]
    train = QuickDataFrame(columns=cols_list)

    prog = Progresser(len(id_mappings))
    for i in range(len(id_mappings)):
        prog.count()

        # read the lemmatised document text
        try:
            with open('./EurLex_data/lem_txt/' + str(id_mappings['DocID'][i]) + '-lem.txt',
                      'r', encoding='utf8') as infile:
                doc_text = infile.read()
        except IOError:
            continue

        # average the word2vec vectors of all non-stop-word tokens in the document
        try:
            sum_array = np.zeros(w2v_length)
            number_of_words = 0
            for word in word_tokenize(doc_text):
                if word not in stop_words and word in w2v:
                    number_of_words += 1
                    sum_array += w2v[word]
            if number_of_words > 0:
                sum_array = sum_array / number_of_words
            train.append([id_mappings['DocID'][i]] + list(sum_array))
        except Exception as e:
            print(e)

    train.to_csv('./EurLex_data/w2v_vector_Q.csv')
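# Sketch of how the './word2vec/word2vec-En.pkl' file expected by build_w2v_vectors()
# could be produced (assumptions: gensim 4.x is available and the pickle maps each word to
# a 300-dimensional numpy array; the model path below is only a placeholder).
def _build_w2v_pickle_sketch(model_path='./word2vec/GoogleNews-vectors-negative300.bin'):
    import gensim
    kv = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
    w2v = {word: kv[word] for word in kv.key_to_index}
    with open('./word2vec/word2vec-En.pkl', 'wb') as outfile:
        pickle.dump(w2v, outfile)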
def find_all_tags():
    subject_data = QuickDataFrame.read_csv('./EurLex_data/eurlex_id2class/id2class_eurlex_subject_matter.qrels',
                                           header=False, columns=['sub', 'doc_id', 'col2'], sep=' ')

    # count how many documents carry each subject tag
    subjects = dict()
    for i in range(len(subject_data)):
        sub = subject_data['sub'][i]
        if sub not in subjects:
            subjects[sub] = 1
        else:
            subjects[sub] += 1

    # write the tags sorted by frequency
    sorted_tags = sorted(subjects, key=lambda x: subjects[x], reverse=True)
    with open('./EurLex_data/tags.csv', 'w') as outfile:
        outfile.write('term,freq\n')
        for tag in sorted_tags:
            outfile.write(tag + ',' + str(subjects[tag]) + '\n')
def build_all_vectors():
    id_mappings = QuickDataFrame.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')
    subject_data = QuickDataFrame.read_csv('./EurLex_data/eurlex_id2class/id2class_eurlex_subject_matter.qrels',
                                           header=False, columns=['sub', 'doc_id', 'col2'], sep=' ')
    words_vector = QuickDataFrame.read_csv('./EurLex_data/1000words.csv', header=False, columns=['term'])
    topics = QuickDataFrame.read_csv('./EurLex_data/tags.csv')
    # train = QuickDataFrame.read_csv('./EurLex_data/w2v_vector_Q.csv')
    # train.set_index(train['doc_id'], unique=True)

    # create DataFrame: one doc_id column plus one column per frequent word
    cols_list = ['doc_id'] + list(words_vector['term'])
    train = QuickDataFrame(columns=cols_list)

    # filling word columns
    prog = Progresser(len(id_mappings))
    for i in range(len(id_mappings)):
        prog.count()
        try:
            # read the lemmatised document text
            try:
                with open('./EurLex_data/lem_txt/' + str(id_mappings['DocID'][i]) + '-lem.txt',
                          'r', encoding='utf8') as infile:
                    doc_text = infile.read()
            except IOError:
                continue

            # add a new row and mark every frequent word that occurs in the document
            train.append(value=0)
            train['doc_id'][len(train) - 1] = id_mappings['DocID'][i]
            for word in word_tokenize(doc_text):
                if word in train.data:
                    train[word][len(train) - 1] = 1
        except Exception as e:
            print(e)

    # index by doc id
    train.set_index(train['doc_id'], unique=True)

    # rename word columns to wrd0, wrd1, ...
    rename_dict = dict()
    for index, wrd in enumerate(list(words_vector['term'])):
        rename_dict[wrd] = 'wrd' + str(index)
    train.rename(columns=rename_dict)

    # add topic columns
    for col in list(topics['term']):
        train.add_column(name=col, value=0)

    # filling topic columns
    for i in range(len(subject_data)):
        try:
            sub = subject_data['sub'][i]
            doc_id = subject_data['doc_id'][i]
            train[sub, doc_id] = 1
        except Exception as e:
            print(e)

    # rename topic columns to tpc0, tpc1, ...
    rename_dict = dict()
    for index, tpc in enumerate(list(topics['term'])):
        rename_dict[tpc] = 'tpc' + str(index)
    train.rename(columns=rename_dict)

    # write to file
    print('\nWriting to file...')
    # train.to_csv('./EurLex_data/eurlex_combined_vectors.csv')
    train.to_csv('./EurLex_data/eurlex_combined_vectors-w2v.csv')
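# Hedged sketch of the order in which the EurLex steps above could be run, based only on the
# files each function reads and writes (the original driver code is not shown in this file).
# Note that build_all_vectors() currently writes 'eurlex_combined_vectors-w2v.csv' while
# load_data() reads 'eurlex_combined_vectors.csv'; the commented-out lines suggest both
# variants were used at different times.
def _run_eurlex_pipeline_sketch():
    find_all_tags()          # writes ./EurLex_data/tags.csv
    find_frequent_words()    # writes ./EurLex_data/1000words.csv
    build_all_vectors()      # writes the combined word/topic vectors
    x_array, y_array = load_data()
    return x_array, y_array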