Code example #1
def get_so_vocab(data_file, skip_no_answer=False):
    """
    Iterate through all text of SO data, tokenize, and generate a list
    of the vocabulary.
    :param data_file:
    :return:
    """
    with open(data_file, "rb") as f:
        data = json.load(f)

    vocab = set()
    vocab_freq = collections.Counter()

    for question in data:
        # TODO: Whether to include text of question title?
        question = json.loads(question)

        q_body = question["body"]
        q_body = clean_text(q_body)

        answers = question["answers"]
        comments = question["comments"]

        if skip_no_answer:
            # There is no dialogue because no comments/answers to question
            if len(answers) == 0 and len(comments) == 0:
                continue

        # Extract vocab from question body. Updating the Counter with the token
        # *set* means vocab_freq tracks how many texts a token appears in
        # (document frequency), not total occurrences.
        body_set, body_list = extract_text_vocab(q_body)
        vocab_freq.update(body_set)
        vocab.update(body_set)

        # Extract vocab from question comments
        for c in comments:
            c = c.encode("utf-8")
            c = clean_text(c)
            c_voc, c_list = extract_text_vocab(c)
            vocab_freq.update(c_voc)
            vocab.update(c_voc)

        # Extract vocab from question answers and answer comments
        for a in answers:
            a_text = a["text"].encode("utf-8")
            a_text = clean_text(a_text)
            a_voc, a_list = extract_text_vocab(a_text)
            vocab_freq.update(a_voc)
            vocab.update(a_voc)

            a_comments = a["comments"]
            for a_c in a_comments:
                a_c = a_c.encode("utf-8")
                a_c = clean_text(a_c)
                a_c_vocab, a_c_list = extract_text_vocab(a_c)
                vocab_freq.update(a_c_vocab)
                vocab.update(a_c_vocab)

    return vocab, vocab_freq
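Every example on this page relies on a clean_text helper (and, in the vocabulary builders, extract_text_vocab) defined elsewhere in the respective project; note that Code example #2 applies data_utils.clean_text to a whole DataFrame, so the signatures differ between projects. Purely as a hedged mental model for the string-based variant, a minimal stand-in might look like this:

import re

def clean_text(text):
    # Hypothetical stand-in: decode bytes, collapse whitespace, lowercase.
    if isinstance(text, bytes):
        text = text.decode("utf-8", errors="ignore")
    return re.sub(r"\s+", " ", text).strip().lower()

def extract_text_vocab(text):
    # Hypothetical stand-in: whitespace-tokenize, return (unique set, token list).
    tokens = text.split()
    return set(tokens), tokens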
Code example #2
def data_analysis(fname, models):
    data = pd.read_csv(fname, encoding="utf-8")
    print("analysis: " + fname)
    data = data[["text"]]
    data = data_utils.clean_text(data)
    data["text"] = data["text"].str.replace('[^A-Za-z ]+', "")
    train_data = pd.read_csv('./Training Data/Sentiment.csv')
    data_utils.clean_text(train_data)
    train_data["text"] = train_data["text"].str.replace('[^A-Za-z ]+', "")

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_data["text"])
    sequences = tokenizer.texts_to_sequences(data["text"])
    tweets_pad = pad_sequences(sequences, maxlen=29, padding="post")

    CNN_result = models["CNN"].predict(tweets_pad)
    print("----- CNN complete -----")
    print("CNN_result: " + str(CNN_result.shape))
    CNN_result = handle_result(CNN_result)
    CNN_result.insert(0, "CNN")
    add_column_in_csv(fname, 'result2.csv', CNN_result)

    LSTM_result = models["LSTM"].predict(tweets_pad)
    print("----- LSTM complete -----")
    print("LSTM_result: " + str(LSTM_result.shape))
    LSTM_result = handle_result(LSTM_result)
    LSTM_result.insert(0, "LSTM")
    add_column_in_csv('result2.csv', 'result3.csv', LSTM_result)

    vec = models["DTVectorizer"].transform(data["text"])
    DT_result = models["DecisionTree"].predict(vec)
    print("----- DT complete -----")
    print("DT_result: " + str(DT_result.shape))
    DT_result = DT_result.tolist()
    DT_result.insert(0, "DT")
    add_column_in_csv("result3.csv", "result4.csv", DT_result)

    vec = models["RFVectorizer"].transform(data["text"])
    RF_result = models["RandomForest"].predict(vec)
    print("----- RF complete -----")
    print("RF_result: " + str(RF_result.shape))
    RF_result = RF_result.tolist()
    RF_result.insert(0, "RF")
    add_column_in_csv("result4.csv", fname, RF_result)

    print("----- " + fname + " analysis complete -----")
    os.remove("result2.csv")
    os.remove("result3.csv")
    os.remove("result4.csv")
Code example #3
def get_mailman_vocab(data_file, skip_no_answer=False):
    """
    Iterate through all text of mailman data, tokenize, and generate a list
    of the unique vocabulary tokens.
    :param data_file:
    :return:
    """
    with open(data_file, "rb") as f:
        data = json.load(f)

    vocab = set()
    vocab_freq = collections.Counter()

    # TODO: Whether to process title for vocab?
    for _, thread in data.iteritems():
        if skip_no_answer:
            # No answer given
            if len(thread) == 1:
                continue

        thread_vocab = set()
        for t in thread:
            thread_voc, thread_list = extract_text_vocab(clean_text(t))
            vocab_freq.update(thread_voc)
            thread_vocab.update(thread_voc)

        vocab.update(thread_vocab)

    return vocab, vocab_freq
Code example #4
def extract_explicit_relation(data: pd.DataFrame, required_relation):
    knowledge_list = []
    deficient_relation = []
    norm_relation = data_utils.load_norm_relation(norm_relation_path)
    for idx, row in data.iterrows():
        # Copy the list so the .remove() calls below do not mutate the shared
        # entry in required_relation across rows.
        required = list(required_relation[row["label"]])

        # "类别" = category, "简介" = abstract
        knowledge_list.append([row["entityName"], "类别", row["label"]])
        knowledge_list.append([
            row["entityName"], "简介",
            data_utils.clean_text(row["instanceAbstract"])
        ])

        if "类别" in required:
            required.remove("类别")
        if "简介" in required:
            required.remove("简介")

        for relation, value in row["instanceInfobox"].items():
            relation = data_utils.clean_text(relation)
            if relation in norm_relation:
                relation = norm_relation[relation]
            value = data_utils.clean_text(value)
            if relation != "" and value != "":
                knowledge_list.append([row["entityName"], relation, value])

            if relation in required:
                required.remove(relation)
        if required:
            deficient_relation.append([row["entityName"], ";".join(required)])

    knowledge_df = pd.DataFrame(knowledge_list,
                                columns=["subject", "predicate", "object"])
    deficient_df = pd.DataFrame(deficient_relation,
                                columns=["entity", "relation"])
    return knowledge_df, deficient_df
Code example #5
def stats_required_relation(data: pd.DataFrame):
    relation_set = {
        "机构": collections.OrderedDict(),
        "概念": collections.OrderedDict(),
        "人物": collections.OrderedDict(),
        "图书": collections.OrderedDict()
    }

    entity_counter = {"机构": 0, "概念": 0, "人物": 0, "图书": 0}

    required_relation = {"机构": [], "概念": [], "人物": [], "图书": []}

    norm_relation = data_utils.load_norm_relation(norm_relation_path)

    for idx, row in data.iterrows():
        entity_counter[row["label"]] += 1
        for relation in [
                data_utils.clean_text(key)
                for key in row["instanceInfobox"].keys()
        ]:
            if relation in norm_relation:
                relation = norm_relation[relation]
            if relation in relation_set[row["label"]]:
                relation_set[row["label"]][relation] += 1
            else:
                relation_set[row["label"]][relation] = 1

    with open("data/baike/temp_relation.txt", "w", encoding="utf-8") as f:
        for key in relation_set.keys():
            f.write("=============================\n")
            f.write("%s %d:\n" % (key, entity_counter[key]))
            for item in sorted(relation_set[key].items(),
                               key=lambda d: d[1],
                               reverse=True):
                f.write("%s: %d\n" % (item[0], item[1]))

    for key in relation_set.keys():
        for relation, num in relation_set[key].items():
            if num > entity_counter[key] * 0.3:
                required_relation[key].append(relation)
    return required_relation
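Code examples #4 and #5 are designed to be chained: stats_required_relation marks a relation as required for a label when it occurs in more than 30% of that label's entities, and extract_explicit_relation then lists entities missing any required relation. A hedged sketch of that pipeline, where the DataFrame columns (entityName, label, instanceAbstract, instanceInfobox) are taken from the code above and the file names are placeholders:

import pandas as pd

# df must provide the columns used above; instanceInfobox holds a dict per row.
df = pd.read_pickle("baike_entities.pkl")  # illustrative source

required = stats_required_relation(df)
knowledge_df, deficient_df = extract_explicit_relation(df, required)

knowledge_df.to_csv("knowledge_triples.csv", index=False)
deficient_df.to_csv("deficient_relations.csv", index=False)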
Code example #6
# convert topics to indices
max_topic = 5
train_topic = tokens_to_indices(topic_index, train_topic, max_topic)
valid_topic = tokens_to_indices(topic_index, valid_topic, max_topic)
test_topic = tokens_to_indices(topic_index, test_topic, max_topic)

# get topic sequence
train_tp_sq = np.array([np.count_nonzero(t) for t in train_topic])
valid_tp_sq = np.array([np.count_nonzero(t) for t in valid_topic])
test_tp_sq = np.array([np.count_nonzero(t) for t in test_topic])

train_location = [clean_text(lc) for lc in train_location]
valid_location = [clean_text(lc) for lc in valid_location]
test_location = [clean_text(lc) for lc in test_location]

train_location = texts_to_tokens(train_location)
valid_location = texts_to_tokens(valid_location)
test_location = texts_to_tokens(test_location)

train_location = tokens_to_indices(location_index, train_location, 6)
valid_location = tokens_to_indices(location_index, valid_location, 6)
test_location = tokens_to_indices(location_index, test_location, 6)

train_lc_sq = np.array([np.count_nonzero(t) for t in train_location])
valid_lc_sq = np.array([np.count_nonzero(t) for t in valid_location])
test_lc_sq = np.array([np.count_nonzero(t) for t in test_location])
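This snippet assumes texts_to_tokens, tokens_to_indices, and the topic_index / location_index vocabularies are defined earlier in the script. As a hedged sketch only, tokens_to_indices plausibly maps tokens to integer ids and pads or truncates each list to a fixed length, with 0 reserved for padding so that the np.count_nonzero lines recover the true sequence lengths:

import numpy as np

def tokens_to_indices(index, token_lists, max_len):
    # Hypothetical helper: map tokens to ids, pad/truncate each row to max_len.
    # Id 0 is reserved for padding/unknown, which is what makes
    # np.count_nonzero(row) equal the unpadded sequence length.
    out = np.zeros((len(token_lists), max_len), dtype=np.int64)
    for i, tokens in enumerate(token_lists):
        ids = [index[t] for t in tokens if t in index][:max_len]
        out[i, :len(ids)] = ids
    return out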
Code example #7
File: create_data.py  Project: mihail911/SNLPDialogue
def gen_java_nlp_data(so_data_fn, mailman_data_fn, sent_outfile):
    """
    Output data to desired format (i.e. ex. id \t src utterance \t tgt utterance).
    SO Data will output dialogues for the following sequences: Q -> [A_1, ..., A_k],
    Q -> [C_1, ..., C_k], and A -> [C_1, ..., C_k] where A_j and C_j denote jth and answer
    and jth comment in sequence for a given question (Q) or answer

    :param so_data_fn Filename containing Stack overflow data (None if not using this data)
    :param mailman_data_fn Filename containing mailman data (None if not using)
    :return:
    """
    output_file = open(sent_outfile, "w")

    a_idx = 1
    if so_data_fn:
        with open(so_data_fn, "rb") as f:
            so_data = json.load(f)
    else:
        so_data = None

    if mailman_data_fn:
        with open(mailman_data_fn, "rb") as f:
            mailman_data = json.load(f)
    else:
        mailman_data = None

    # Read in so_data and output to file
    if so_data:
        for question in so_data:
            # TODO: Whether to include text of question title?
            question = json.loads(question)

            q_body = question["body"].encode("utf-8")
            q_body = clean_text(q_body)

            answers = question["answers"]
            comments = question["comments"]

            # There is no dialogue because no comments/answers to question
            if len(answers) == 0 and len(comments) == 0:
                continue

            # Create dialogue of form (Q, C_1), (Q+C_1, C_2), etc.
            curr_c = ""
            for c in comments:
                c = c.encode("utf-8")
                c = clean_text(c)
                src = q_body + curr_c
                target = c
                output_file.write(str(a_idx) + "\t" + target + "\t" + src + "\n")

                curr_c += " " + c

                a_idx += 1

            # Create dialogue of form (Q, A_1), (Q+A_1, A_2), etc.
            curr_a = ""
            for a in answers:
                a_text = a["text"].encode("utf-8")
                a_text = clean_text(a_text)
                src = q_body + curr_a
                target = a_text
                output_file.write(str(a_idx) + "\t" + target + "\t" + src + "\n")

                curr_a += " " + a_text
                a_idx += 1

                # Also of form (A_1, C_11), (A_1+C_11, C_21), etc.
                a_comments = a["comments"]
                curr_a_c = ""
                for a_c in a_comments:
                    a_c = a_c.encode("utf-8")
                    a_c = clean_text(a_c)
                    src = a_text + curr_a_c
                    target = a_c

                    output_file.write(str(a_idx) + "\t" + target + "\t" + src + "\n")

                    curr_a_c += " " + a_c
                    a_idx += 1

    # Read in mailman_data and output to file
    if mailman_data:
        for _, thread in mailman_data.iteritems():
            # No answer given so no valid dialogue
            if len(thread) == 1:
                continue

            question = thread[0].encode("utf-8")
            question = clean_text(question)
            curr_a = ""
            for t in thread[1:]:
                # TODO: Remove "-----" string
                t = t.encode("utf-8")
                t = clean_text(t)
                src = question + curr_a
                target = t
                if t == "":
                    continue

                output_file.write(str(a_idx) + "\t" + target + "\t" + src + "\n")

                curr_a += " " + t
                a_idx += 1

    output_file.close()
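A hypothetical invocation (file names are placeholders; pass None to skip either source). Each output line is id \t target utterance \t source utterance, where the source is the question or answer body concatenated with all earlier turns of that sub-dialogue:

gen_java_nlp_data("so_questions.json", "mailman_threads.json", "java_dialogues.tsv")
gen_java_nlp_data("so_questions.json", None, "so_only_dialogues.tsv")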
Code example #8
def get_corpus_in_sentences(doc):
    sentences = [data_utils.clean_text(s) for s in doc.split("。")]
    return sentences
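Because str.split keeps whatever follows the final delimiter, a document ending in "。" yields a trailing empty string that callers may want to filter out. A quick illustration, assuming data_utils.clean_text behaves roughly as a pass-through here:

doc = "第一句。第二句。"
print(get_corpus_in_sentences(doc))
# With a pass-through clean_text this prints: ['第一句', '第二句', '']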