from parse_debates import DebateParser
import ngrams
import os
import mallet
import build_graph
import networkx

'''
Main entry point for generating the project's mallet input files.
'''
if __name__ == "__main__":
    # Step 1: parse everything under ./data/debates and write the raw
    # statement text that mallet consumes.
    parser = DebateParser("./data/debates")
    parser.parse()
    parser.build_text_for_mallet('./data/mallet_raw_statements.txt')

    # Step 2: flatten parser.statements (a mapping whose values are
    # sequences of entries) down to the first element of every entry --
    # presumably the statement text; confirm against DebateParser.
    statement_texts = []
    for entries in parser.statements.values():
        for entry in entries:
            statement_texts.append(entry[0])

    # Step 3: derive the bigrams and save them as the mallet replacement file.
    replacements_path = os.path.join("./data/mallet_files", "replacements.txt")
    ngrams.save_bigrams_for_replacement_file_txt(statement_texts, replacements_path)
Esempio n. 2
0
    with open(path, "wb+") as output_file:
        writer = csv.writer(output_file, delimiter="\t")

        if len(statements) > 0:
            statements = [removeNonAscii(statement) for statement in statements]
            ngrams = get_bigram_likelihood(statements)
            if ngrams != '':
                for ngram in ngrams:
                    writer.writerow([ngram[0][0] + '_' + ngram[0][1], ngram[1]])

def save_bigrams_for_replacement_file_txt(statements, path):
    """
    Write the likely bigrams of ``statements`` to a text file, one per line.

    Each statement is passed through ``removeNonAscii`` and the cleaned list
    is handed to ``get_bigram_likelihood``. Every entry of the result is
    written as ``"<word1> <word2>\\n"``, taken from ``entry[0][0]`` and
    ``entry[0][1]``. A result equal to the empty string writes nothing.

    The output file is always created (or truncated), even when
    ``statements`` is empty.

    NOTE(review): the file is opened in binary mode ("wb+") while str values
    are written; that is fine on Python 2 but would raise TypeError on
    Python 3 -- confirm the target interpreter.

    :param statements: list of strings
    :param path: output path for the saved txt
    """
    with open(path, "wb+") as output_file:
        if len(statements) == 0:
            # Nothing to analyse; leaving the block closes the empty file.
            return

        cleaned = [removeNonAscii(text) for text in statements]
        scored_bigrams = get_bigram_likelihood(cleaned)
        if scored_bigrams != '':
            for entry in scored_bigrams:
                words = entry[0]
                output_file.write(words[0] + ' ' + words[1] + '\n')

if __name__ == "__main__":
    # Parse every debate found under ./data/debates.
    parser = DebateParser("./data/debates")
    parser.parse()

    # Save the bigram likelihoods computed over the first element of every
    # parsed entry (presumably the statement text; confirm in DebateParser).
    tsv_texts = [entry[0]
                 for entries in parser.statements.values()
                 for entry in entries]
    tsv_path = os.path.join("data", "ngrams.tsv")
    save_bigram_likelihood_tsv(tsv_texts, tsv_path)

    # Build the same flattened list again and save the bigrams as the
    # mallet replacements file.
    replacement_texts = [entry[0]
                         for entries in parser.statements.values()
                         for entry in entries]
    replacements_path = os.path.join("./data/mallet_files", "replacements.txt")
    save_bigrams_for_replacement_file_txt(replacement_texts, replacements_path)
Esempio n. 3
0
                for ngram in ngrams:
                    writer.writerow(
                        [ngram[0][0] + '_' + ngram[0][1], ngram[1]])


def save_bigrams_for_replacement_file_txt(statements, path):
    """
    Save the likely bigrams of ``statements`` as plain text, one per line.

    The statements are stripped of non-ASCII characters via
    ``removeNonAscii`` and scored by ``get_bigram_likelihood``. For each
    returned entry, the two words at ``entry[0][0]`` and ``entry[0][1]`` are
    written separated by a single space and followed by a newline. Nothing
    is written when the scorer returns the empty string.

    The file at ``path`` is created or truncated even if ``statements`` is
    empty.

    NOTE(review): "wb+" is a binary mode but str values are written; this
    works on Python 2 and would raise TypeError on Python 3 -- confirm the
    target interpreter.

    :param statements: list of strings
    :param path: output path for the saved txt
    """
    with open(path, "wb+") as output_file:
        if len(statements) == 0:
            # No input: exit early, the context manager closes the empty file.
            return

        ascii_statements = [removeNonAscii(raw) for raw in statements]
        likely_bigrams = get_bigram_likelihood(ascii_statements)
        if likely_bigrams != '':
            for scored in likely_bigrams:
                pair = scored[0]
                output_file.write(pair[0] + ' ' + pair[1] + '\n')


if __name__ == "__main__":
    # Load and parse the debates stored under ./data/debates.
    parser = DebateParser("./data/debates")
    parser.parse()

    # First output: bigram likelihoods, computed from the first element of
    # every parsed entry (presumably the statement text; confirm in
    # DebateParser).
    likelihood_input = []
    for group in parser.statements.values():
        for record in group:
            likelihood_input.append(record[0])
    save_bigram_likelihood_tsv(likelihood_input,
                               os.path.join("data", "ngrams.tsv"))

    # Second output: the mallet replacements file, built from a freshly
    # flattened copy of the same data.
    replacement_input = []
    for group in parser.statements.values():
        for record in group:
            replacement_input.append(record[0])
    save_bigrams_for_replacement_file_txt(
        replacement_input,
        os.path.join("./data/mallet_files", "replacements.txt"))
from parse_debates import DebateParser
import ngrams
import os
import mallet
import build_graph
import networkx
'''
Main entry point for generating the project's mallet input files.
'''
if __name__ == "__main__":
    # Parse the debates and write the raw statement text used as mallet input.
    parser = DebateParser("./data/debates")
    parser.parse()
    parser.build_text_for_mallet('./data/mallet_raw_statements.txt')

    # Take the first element of every entry in parser.statements
    # (presumably the statement text; confirm against DebateParser) ...
    texts_for_bigrams = [
        record[0]
        for group in parser.statements.values()
        for record in group
    ]

    # ... and save the bigrams found in them as the mallet replacements file.
    output_path = os.path.join("./data/mallet_files", "replacements.txt")
    ngrams.save_bigrams_for_replacement_file_txt(texts_for_bigrams, output_path)