def get_so_vocab(data_file, skip_no_answer=False):
    """
    Iterate through all text of SO data, tokenize, and generate a list of the vocabulary.
    :param data_file:
    :return:
    """
    with open(data_file, "rb") as f:
        data = json.load(f)

    vocab = set()
    vocab_freq = collections.Counter()
    for question in data:
        # TODO: Whether to include text of question title?
        question = json.loads(question)
        q_body = question["body"]
        q_body = clean_text(q_body)
        answers = question["answers"]
        comments = question["comments"]

        if skip_no_answer:
            # There is no dialogue because no comments/answers to question
            if len(answers) == 0 and len(comments) == 0:
                continue

        # Extract vocab from question body
        body_set, body_list = extract_text_vocab(q_body)
        vocab_freq.update(body_set)
        vocab.update(body_set)

        # Extract vocab from question comments
        for c in comments:
            c = c.encode("utf-8")
            c = clean_text(c)
            c_voc, c_list = extract_text_vocab(c)
            vocab_freq.update(c_voc)
            vocab.update(c_voc)

        # Extract vocab from question answers and answer comments
        for a in answers:
            a_text = a["text"].encode("utf-8")
            a_text = clean_text(a_text)
            a_voc, a_list = extract_text_vocab(a_text)
            vocab_freq.update(a_voc)
            vocab.update(a_voc)

            a_comments = a["comments"]
            for a_c in a_comments:
                a_c = a_c.encode("utf-8")
                a_c = clean_text(a_c)
                a_c_vocab, a_c_list = extract_text_vocab(a_c)
                vocab_freq.update(a_c_vocab)
                vocab.update(a_c_vocab)

    return vocab, vocab_freq
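# Illustrative usage sketch (not part of the original module; the file name is
# a placeholder). The dump is assumed to be a JSON list whose elements are
# themselves JSON-encoded question objects, matching the parsing loop above.
def example_build_so_vocab(so_dump="so_java_questions.json"):
    vocab, vocab_freq = get_so_vocab(so_dump, skip_no_answer=True)
    print("unique tokens: %d" % len(vocab))
    for token, count in vocab_freq.most_common(20):
        print("%s\t%d" % (token, count))
    return vocab, vocab_freq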
def data_analysis(fname, models):
    """
    Run each trained sentiment model over the tweets in `fname`, chaining the
    per-model prediction columns through temporary CSVs and writing the final
    result back to `fname`.
    """
    data = pd.read_csv(fname, encoding="utf-8")
    print("analysis: " + fname)
    data = data[["text"]]
    data = data_utils.clean_text(data)
    data["text"] = data["text"].str.replace('[^A-Za-z ]+', "")

    # Re-fit the tokenizer on the training corpus so word indices match the
    # vocabulary the Keras models were trained with.
    train_data = pd.read_csv('./Training Data/Sentiment.csv')
    data_utils.clean_text(train_data)
    train_data["text"] = train_data["text"].str.replace('[^A-Za-z ]+', "")

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_data["text"])
    sequences = tokenizer.texts_to_sequences(data["text"])
    tweets_pad = pad_sequences(sequences, maxlen=29, padding="post")

    CNN_result = models["CNN"].predict(tweets_pad)
    print("----- CNN complete -----")
    print("CNN_result: " + str(CNN_result.shape))
    CNN_result = handle_result(CNN_result)
    CNN_result.insert(0, "CNN")
    add_column_in_csv(fname, 'result2.csv', CNN_result)

    LSTM_result = models["LSTM"].predict(tweets_pad)
    print("----- LSTM complete -----")
    print("LSTM_result: " + str(LSTM_result.shape))
    LSTM_result = handle_result(LSTM_result)
    LSTM_result.insert(0, "LSTM")
    add_column_in_csv('result2.csv', 'result3.csv', LSTM_result)

    vec = models["DTVectorizer"].transform(data["text"])
    DT_result = models["DecisionTree"].predict(vec)
    print("----- DT complete -----")
    print("DT_result: " + str(DT_result.shape))
    DT_result = DT_result.tolist()
    DT_result.insert(0, "DT")
    add_column_in_csv("result3.csv", "result4.csv", DT_result)

    vec = models["RFVectorizer"].transform(data["text"])
    RF_result = models["RandomForest"].predict(vec)
    print("----- RF complete -----")
    print("RF_result: " + str(RF_result.shape))
    RF_result = RF_result.tolist()
    RF_result.insert(0, "RF")
    add_column_in_csv("result4.csv", fname, RF_result)

    print("----- " + fname + " analysis complete -----")
    os.remove("result2.csv")
    os.remove("result3.csv")
    os.remove("result4.csv")
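# Illustrative driver (not from the original script). It assumes the Keras
# models were saved with model.save() and the sklearn vectorizers/classifiers
# with joblib.dump(); all file names below are placeholders.
def example_run_analysis(tweet_csv="tweets.csv"):
    import joblib
    from tensorflow.keras.models import load_model

    models = {
        "CNN": load_model("cnn_sentiment.h5"),
        "LSTM": load_model("lstm_sentiment.h5"),
        "DTVectorizer": joblib.load("dt_vectorizer.joblib"),
        "DecisionTree": joblib.load("decision_tree.joblib"),
        "RFVectorizer": joblib.load("rf_vectorizer.joblib"),
        "RandomForest": joblib.load("random_forest.joblib"),
    }
    data_analysis(tweet_csv, models)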
def get_mailman_vocab(data_file, skip_no_answer=False):
    """
    Iterate through all text of mailman data, tokenize, and generate a list of
    the unique vocabulary tokens.
    :param data_file:
    :return:
    """
    with open(data_file, "rb") as f:
        data = json.load(f)

    vocab = set()
    vocab_freq = collections.Counter()
    # TODO: Whether to process title for vocab?
    for _, thread in data.iteritems():
        if skip_no_answer:
            # No answer given
            if len(thread) == 1:
                continue

        thread_vocab = set()
        for t in thread:
            thread_voc, thread_list = extract_text_vocab(clean_text(t))
            vocab_freq.update(thread_voc)
            thread_vocab.update(thread_voc)
        vocab.update(thread_vocab)

    return vocab, vocab_freq
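# Illustrative usage sketch (placeholder file names): both functions return a
# (set, collections.Counter) pair, so the two vocabularies can be merged
# directly.
def example_build_combined_vocab(so_dump="so_java_questions.json",
                                 mailman_dump="mailman_java_threads.json"):
    so_vocab, so_freq = get_so_vocab(so_dump, skip_no_answer=True)
    mm_vocab, mm_freq = get_mailman_vocab(mailman_dump, skip_no_answer=True)
    return so_vocab | mm_vocab, so_freq + mm_freq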
def extract_explicit_relation(data: pd.DataFrame, required_relation):
    knowledge_list = []
    deficient_relation = []
    norm_relation = data_utils.load_norm_relation(norm_relation_path)
    for idx, row in data.iterrows():
        # Copy the per-label list so the removals below do not mutate the
        # shared required_relation entry across rows.
        required = list(required_relation[row["label"]])

        knowledge_list.append([row["entityName"], "类别", row["label"]])
        knowledge_list.append([
            row["entityName"], "简介",
            data_utils.clean_text(row["instanceAbstract"])
        ])
        if "类别" in required:
            required.remove("类别")
        if "简介" in required:
            required.remove("简介")

        for relation, value in row["instanceInfobox"].items():
            relation = data_utils.clean_text(relation)
            if relation in norm_relation:
                relation = norm_relation[relation]
            value = data_utils.clean_text(value)
            if relation != "" and value != "":
                knowledge_list.append([row["entityName"], relation, value])
            if relation in required:
                required.remove(relation)

        if required:
            deficient_relation.append([row["entityName"], ";".join(required)])

    knowledge_df = pd.DataFrame(knowledge_list,
                                columns=["subject", "predicate", "object"])
    deficient_df = pd.DataFrame(deficient_relation,
                                columns=["entity", "relation"])
    return knowledge_df, deficient_df
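# Illustrative usage sketch (not from the original pipeline): a one-row
# stand-in for the baike dump with a hand-written required_relation dict. It
# still depends on data_utils.load_norm_relation / norm_relation_path being
# available, and assumes the normalization map leaves these relation names
# unchanged.
def example_extract_relations():
    sample = pd.DataFrame([{
        "entityName": "清华大学",
        "label": "机构",
        "instanceAbstract": "清华大学是一所位于北京市的公立研究型大学。",
        "instanceInfobox": {"创办时间": "1911年"},
    }])
    required = {"机构": ["类别", "简介", "创办时间", "现任校长"]}
    knowledge_df, deficient_df = extract_explicit_relation(sample, required)
    # knowledge_df holds (subject, predicate, object) triples for 类别, 简介,
    # and 创办时间; deficient_df lists 清华大学 with the missing 现任校长.
    print(knowledge_df)
    print(deficient_df)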
def stats_required_relation(data: pd.DataFrame):
    relation_set = {
        "机构": collections.OrderedDict(),
        "概念": collections.OrderedDict(),
        "人物": collections.OrderedDict(),
        "图书": collections.OrderedDict()
    }
    entity_counter = {"机构": 0, "概念": 0, "人物": 0, "图书": 0}
    required_relation = {"机构": [], "概念": [], "人物": [], "图书": []}
    norm_relation = data_utils.load_norm_relation(norm_relation_path)

    for idx, row in data.iterrows():
        entity_counter[row["label"]] += 1
        for relation in [
                data_utils.clean_text(key)
                for key in row["instanceInfobox"].keys()
        ]:
            if relation in norm_relation:
                relation = norm_relation[relation]
            if relation in relation_set[row["label"]]:
                relation_set[row["label"]][relation] += 1
            else:
                relation_set[row["label"]][relation] = 1

    with open("data/baike/temp_relation.txt", "w", encoding="utf-8") as f:
        for key in relation_set.keys():
            f.write("=============================\n")
            f.write("%s %d:\n" % (key, entity_counter[key]))
            for item in sorted(relation_set[key].items(),
                               key=lambda d: d[1], reverse=True):
                f.write("%s: %d\n" % (item[0], item[1]))

    # Keep a relation as "required" only if it appears in more than 30% of
    # the entities of that label.
    for key in relation_set.keys():
        for relation, num in relation_set[key].items():
            if num > entity_counter[key] * 0.3:
                required_relation[key].append(relation)

    return required_relation
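# Illustrative sketch of the selection rule above (placeholder data). Assumes
# data/baike/ exists (the function writes a frequency dump there) and that
# clean_text / the norm_relation map leave these relation names unchanged.
def example_required_relation_threshold():
    # 外文名 appears in all 10 概念 entities and is kept; 别名 appears in 3 of
    # 10 (exactly 30%, not strictly greater) and is dropped.
    rows = [{"label": "概念",
             "instanceInfobox": ({"外文名": "x", "别名": "y"} if i < 3
                                 else {"外文名": "x"})}
            for i in range(10)]
    required = stats_required_relation(pd.DataFrame(rows))
    print(required["概念"])  # expected: ["外文名"]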
# convert topics to indices
max_topic = 5
train_topic = tokens_to_indices(topic_index, train_topic, max_topic)
valid_topic = tokens_to_indices(topic_index, valid_topic, max_topic)
test_topic = tokens_to_indices(topic_index, test_topic, max_topic)

# get topic sequence lengths
train_tp_sq = np.array([np.count_nonzero(t) for t in train_topic])
valid_tp_sq = np.array([np.count_nonzero(t) for t in valid_topic])
test_tp_sq = np.array([np.count_nonzero(t) for t in test_topic])

# convert locations to indices
train_location = [clean_text(lc) for lc in train_location]
valid_location = [clean_text(lc) for lc in valid_location]
test_location = [clean_text(lc) for lc in test_location]

train_location = texts_to_tokens(train_location)
valid_location = texts_to_tokens(valid_location)
test_location = texts_to_tokens(test_location)

train_location = tokens_to_indices(location_index, train_location, 6)
valid_location = tokens_to_indices(location_index, valid_location, 6)
test_location = tokens_to_indices(location_index, test_location, 6)

# get location sequence lengths
train_lc_sq = np.array([np.count_nonzero(t) for t in train_location])
valid_lc_sq = np.array([np.count_nonzero(t) for t in valid_location])
test_lc_sq = np.array([np.count_nonzero(t) for t in test_location])
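# tokens_to_indices and texts_to_tokens are defined elsewhere in the project;
# the calls above assume roughly the behavior sketched here for
# tokens_to_indices: look up each token in the index (0 for unknown), then pad
# or truncate to max_len with zeros so np.count_nonzero recovers the true
# sequence lengths. This is an assumption about the helper, not its source.
def _tokens_to_indices_sketch(index, token_lists, max_len):
    out = np.zeros((len(token_lists), max_len), dtype=np.int64)
    for i, tokens in enumerate(token_lists):
        for j, tok in enumerate(tokens[:max_len]):
            out[i, j] = index.get(tok, 0)
    return out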
def gen_java_nlp_data(so_data_fn, mailman_data_fn, sent_outfile):
    """
    Output data to desired format (i.e. ex. id \t tgt utterance \t src utterance).

    SO data will output dialogues for the following sequences:
    Q -> [A_1, ..., A_k], Q -> [C_1, ..., C_k], and A -> [C_1, ..., C_k]
    where A_j and C_j denote the jth answer and jth comment in sequence for a
    given question (Q) or answer.

    :param so_data_fn: Filename containing Stack Overflow data (None if not using this data)
    :param mailman_data_fn: Filename containing mailman data (None if not using)
    :return:
    """
    output_file = open(sent_outfile, "w")
    a_idx = 1

    if so_data_fn:
        with open(so_data_fn, "rb") as f:
            so_data = json.load(f)
    else:
        so_data = None

    if mailman_data_fn:
        with open(mailman_data_fn, "rb") as f:
            mailman_data = json.load(f)
    else:
        mailman_data = None

    # Read in so_data and output to file
    if so_data:
        for question in so_data:
            # TODO: Whether to include text of question title?
            question = json.loads(question)
            q_body = question["body"].encode("utf-8")
            q_body = clean_text(q_body)
            answers = question["answers"]
            comments = question["comments"]

            # There is no dialogue because no comments/answers to question
            if len(answers) == 0 and len(comments) == 0:
                continue

            # Create dialogue of form (Q, C_1), (Q+C_1, C_2), etc.
            curr_c = ""
            for c in comments:
                c = c.encode("utf-8")
                c = clean_text(c)
                src = q_body + curr_c
                target = c
                output_file.write(str(a_idx) + "\t" + target + "\t" + src + "\n")
                curr_c += " " + c
                a_idx += 1

            # Create dialogue of form (Q, A_1), (Q+A_1, A_2), etc.
            curr_a = ""
            for a in answers:
                a_text = a["text"].encode("utf-8")
                a_text = clean_text(a_text)
                src = q_body + curr_a
                target = a_text
                output_file.write(str(a_idx) + "\t" + target + "\t" + src + "\n")
                curr_a += " " + a_text
                a_idx += 1

                # Also of form (A_1, C_11), (A_1+C_11, C_21), etc.
                a_comments = a["comments"]
                curr_a_c = ""
                for a_c in a_comments:
                    a_c = a_c.encode("utf-8")
                    a_c = clean_text(a_c)
                    src = a_text + curr_a_c
                    target = a_c
                    output_file.write(str(a_idx) + "\t" + target + "\t" + src + "\n")
                    curr_a_c += " " + a_c
                    a_idx += 1

    # Read in mailman_data and output to file
    if mailman_data:
        for _, thread in mailman_data.iteritems():
            # No answer given so no valid dialogue
            if len(thread) == 1:
                continue

            question = thread[0].encode("utf-8")
            question = clean_text(question)
            curr_a = ""
            for t in thread[1:]:
                # TODO: Remove "-----" string
                t = t.encode("utf-8")
                t = clean_text(t)
                src = question + curr_a
                target = t
                if t == "":
                    continue
                output_file.write(str(a_idx) + "\t" + target + "\t" + src + "\n")
                curr_a += " " + t
                a_idx += 1

    output_file.close()
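# Illustrative driver (placeholder file names, not from the original project).
# Each output line is: example id \t target utterance \t source utterance.
# Pass None for a source you don't want to include.
def example_gen_dialogue_file():
    gen_java_nlp_data("so_java_questions.json",
                      "mailman_java_threads.json",
                      "java_dialogues.tsv")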
def get_corpus_in_sentences(doc):
    sentences = [data_utils.clean_text(s) for s in doc.split("。")]
    return sentences
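# Illustrative usage (placeholder text): split a document on the full-width
# Chinese period and clean each piece. Note that splitting on the trailing
# "。" also yields a final empty segment, which clean_text is assumed to leave
# empty and callers to filter downstream.
def example_split_doc():
    doc = "清华大学位于北京市。它创办于1911年。"
    for sentence in get_corpus_in_sentences(doc):
        print(sentence)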