Example #1
import io
from collections import defaultdict


def chron_order(dataset='training'):
    # `reranker` and `getDirectoryPath` are assumed to be provided by the project.
    top_sentences = reranker.select_top(dataset)
    input_directoryPath = getDirectoryPath("outputs/reranker/devtest/")
    output_directoryPath = getDirectoryPath("outputs/reorder/devtest/")
    chron_sents = {}
    for topic_id in top_sentences.keys():
        sentences = top_sentences[topic_id]
        id_part1 = topic_id[:-1]
        id_part2 = topic_id[-1:]
        output_file_name = id_part1 + "-A.M.100." + id_part2 + ".1"
        output_file_path = output_directoryPath + "/" + output_file_name

        chron_list = []
        date_index = defaultdict(list)

        for sentence in sentences:
            date = sentence.time_stamp
            date_index[date].append(sentence)

        for date in sorted(date_index):
            date_sents = date_index[date]
            date_sents.sort(key=lambda x: x.order)
            chron_list.extend(date_sents)

        with io.open(output_file_path, 'w', encoding='utf8') as outputFile:
            for sentence in chron_list:
                # write() takes a single string, so sentence and newline go in separate calls
                outputFile.write(sentence.clean_sent)
                outputFile.write('\n')
            outputFile.flush()

        chron_sents[topic_id] = chron_list
    return chron_sents
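All three examples resolve their output locations through the project's `getDirectoryPath` helper, which is not shown on this page. A minimal sketch, assuming it only joins the relative path onto a (hypothetical) project root and creates the directory if it is missing, could look like this:

import os

# Hypothetical project root; the real helper may resolve it differently.
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))

def getDirectoryPath(relative_path):
    # Make sure the output directory exists so the io.open(..., 'w') calls above do not fail.
    directory = os.path.join(PROJECT_ROOT, relative_path)
    os.makedirs(directory, exist_ok=True)
    return directory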
Example #2
import io
from itertools import permutations

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def cohesion_order(dataset='training'):
    # `reranker` and `getDirectoryPath` are assumed to be provided by the project.
    top_sentences = reranker.select_top(dataset)
    input_directoryPath = getDirectoryPath("outputs/reranker/devtest/")
    output_directoryPath = getDirectoryPath("outputs/reorder/devtest/")
    cohesion_sents = {}

    for topic_id in top_sentences.keys():
        sentences = top_sentences[topic_id]
        id_part1 = topic_id[:-1]
        id_part2 = topic_id[-1:]
        output_file_name = id_part1 + "-A.M.100." + id_part2 + ".1"
        output_file_path = output_directoryPath + "/" + output_file_name

        num_sents = len(sentences)
        clean_sents = [sentence.clean_sent for sentence in sentences]
        vectorizer = TfidfVectorizer()
        cosine_matrix = cosine_similarity(
            vectorizer.fit_transform(clean_sents))
        ids = list(range(num_sents))
        perms = list(permutations(ids, num_sents))
        max_score = 0
        winning_perm = tuple(ids)  # fall back to the original order if no permutation scores above 0
        for perm in perms:
            perm_score = 0
            for i in range(num_sents - 1):
                sent1_id = perm[i]
                sent2_id = perm[i + 1]
                adj_sim = cosine_matrix[sent1_id][sent2_id]
                perm_score += adj_sim
            if perm_score > max_score:
                max_score = perm_score
                winning_perm = perm

        cohesion_list = [sentences[i] for i in winning_perm]

        with io.open(output_file_path, 'w', encoding='utf8') as outputFile:
            for sentence in cohesion_list:
                outputFile.write(sentence.clean_sent)
                outputFile.write('\n')
            outputFile.flush()

        cohesion_sents[topic_id] = cohesion_list
    return cohesion_sents
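The permutation search above scores an ordering by summing the cosine similarities of adjacent sentences and keeping the highest-scoring permutation, so its cost grows factorially with the number of sentences. As a small illustration of the scoring step alone, assuming a made-up 3x3 similarity matrix:

from itertools import permutations

# Made-up pairwise cosine similarities for three sentences.
cosine_matrix = [[1.0, 0.2, 0.7],
                 [0.2, 1.0, 0.5],
                 [0.7, 0.5, 1.0]]

best_score, best_perm = -1, None
for perm in permutations(range(3)):
    # Sum the similarity of each adjacent pair in this candidate ordering.
    score = sum(cosine_matrix[perm[i]][perm[i + 1]] for i in range(2))
    if score > best_score:
        best_score, best_perm = score, perm

print(best_perm, best_score)  # (0, 2, 1) with score 1.2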
Example #3
import io

from sklearn.neighbors import KNeighborsClassifier


def entitygrid_reorder(dataset='training'):
    # `reranker`, `getDirectoryPath`, `getFilePath`, `readModel`, the Stanford
    # loader helpers and the entity-grid helpers are assumed to come from the project.
    output_directoryPath = getDirectoryPath("outputs/D3/")
    model_file_path = getFilePath("model")
    KNN = 11
    number_of_random_orders = 20

    # reading model
    vectors, labels = readModel(model_file_path)

    # building classifier
    neigh = KNeighborsClassifier(n_neighbors=KNN)
    neigh.fit(vectors, labels)

    # NER + Dep parser
    ner_tagger = loadStanfordNERTagger()
    stanford_dependency_parser = loadStanfordDependencyParser()

    # page rank + cosine reordering
    top_sentences = reranker.select_top(dataset)

    for topic_id in top_sentences.keys():
        sentences = top_sentences[topic_id]
        id_part1 = topic_id[:-1]
        id_part2 = topic_id[-1:]
        output_file_name = id_part1 + "-A.M.100." + id_part2 + ".1"
        output_file_path = output_directoryPath + "/" + output_file_name

        print("summary ....")
        sentences = [sentence.clean_sent.strip() for sentence in sentences]
        for s in sentences:
            print(s)

        sent_ent_matrix = generateMatrixForSummary(sentences, ner_tagger, stanford_dependency_parser)

        if sent_ent_matrix is None:
            continue

        # original order: the sentence keys of the entity matrix
        original_order = list(sent_ent_matrix)

        # generate random ordering
        print("\n3: random ordering ..")
        random_orders = generateRandomOrders(original_order, number_of_random_orders)

        max_prob = -1
        best_order = []

        # generate vectors for random orders
        for random_order in random_orders:
            print(random_order)
            feature_vector = createFeatureVector(sent_ent_matrix, random_order)
            print(feature_vector)
            scores = neigh.predict_proba(feature_vector)
            print("scores: " + str(scores))
            if scores[0][1] > max_prob:
                max_prob = scores[0][1]
                best_order = random_order
            print("\n")

        # generate vector for original order
        print(original_order)
        feature_vector = createFeatureVector(sent_ent_matrix, original_order)
        print(feature_vector)
        scores = neigh.predict_proba(feature_vector)
        print("scores: " + str(scores))
        if scores[0][1] > max_prob:
            max_prob = scores[0][1]
            best_order = original_order

        print("Best score: " + str(max_prob))
        print(best_order)

        # print best order to the output file
        with io.open(output_file_path, 'w', encoding='utf8') as outputFile:
            for order in best_order:
                outputFile.write(sentences[order]+"\n")
            outputFile.flush()
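This example relies on `generateRandomOrders` to propose the candidate orderings that are scored against the original one. The helper is not shown on this page; a minimal sketch, assuming it simply returns random shuffles of the original order, might look like this:

import random

def generateRandomOrders(original_order, number_of_random_orders):
    # Produce `number_of_random_orders` shuffled copies of the original order.
    # (Sketch only; the project's helper may deduplicate or seed differently.)
    random_orders = []
    for _ in range(number_of_random_orders):
        candidate = list(original_order)
        random.shuffle(candidate)
        random_orders.append(candidate)
    return random_orders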