from parse_debates import DebateParser
import ngrams
import os
import mallet
import build_graph
import networkx
'''
main entry point for generating mallet input files project
'''

if __name__ == "__main__":
    # Step 1: parse the debate transcripts and emit the raw mallet input file.
    debate_parser = DebateParser("./data/debates")
    debate_parser.parse()
    debate_parser.build_text_for_mallet('./data/mallet_raw_statements.txt')

    # Step 2: flatten every per-debate statement tuple down to its text
    # (item[0]) and save the likely bigrams as a mallet replacement file.
    flattened = [
        item[0]
        for sublist in debate_parser.statements.values()
        for item in sublist
    ]
    ngrams.save_bigrams_for_replacement_file_txt(
        flattened,
        os.path.join("./data/mallet_files", "replacements.txt"))
def save_bigram_likelihood_tsv(statements, path):
    """Save likely bigrams and their likelihood scores as a tab-separated file.

    Each row is ``word1_word2<TAB>score``.

    :param statements: list of statement strings
    :param path: output path for the saved tsv
    """
    # Text mode with newline='' is required for csv on Python 3: csv.writer
    # emits str (not bytes) and manages its own line terminators. The old
    # "wb+" binary mode raised TypeError when writerow wrote str.
    with open(path, "w", newline="") as output_file:
        writer = csv.writer(output_file, delimiter="\t")
        if len(statements) > 0:
            statements = [removeNonAscii(statement) for statement in statements]
            ngrams = get_bigram_likelihood(statements)
            # NOTE(review): get_bigram_likelihood appears to return '' when it
            # has nothing to score — comparison kept as-is; confirm its contract.
            if ngrams != '':
                for ngram in ngrams:
                    # ngram is ((word1, word2), score)
                    writer.writerow([ngram[0][0] + '_' + ngram[0][1], ngram[1]])


def save_bigrams_for_replacement_file_txt(statements, path):
    """Save likely bigrams in a txt, one space-separated bigram per line.

    :param statements: list of strings
    :param path: output path for saved txt
    """
    # Text mode: the writes below pass str, which binary "wb+" mode rejects
    # on Python 3.
    with open(path, "w") as output_file:
        if len(statements) > 0:
            statements = [removeNonAscii(statement) for statement in statements]
            ngrams = get_bigram_likelihood(statements)
            if ngrams != '':
                for ngram in ngrams:
                    output_file.write(ngram[0][0] + ' ' + ngram[0][1] + '\n')


if __name__ == "__main__":
    parser = DebateParser("./data/debates")
    parser.parse()
    # Flatten the per-debate statement tuples once and reuse the list for
    # both outputs (the original rebuilt the same comprehension twice).
    statements = [
        item[0]
        for sublist in parser.statements.values()
        for item in sublist
    ]
    save_bigram_likelihood_tsv(
        statements, os.path.join("data", "ngrams.tsv"))
    save_bigrams_for_replacement_file_txt(
        statements, os.path.join("./data/mallet_files", "replacements.txt"))
def save_bigram_likelihood_tsv(statements, path):
    """Save likely bigrams and their likelihood scores as a tab-separated file.

    Each row is ``word1_word2<TAB>score``.

    :param statements: list of statement strings
    :param path: output path for the saved tsv
    """
    # csv on Python 3 needs a text-mode handle opened with newline='' —
    # csv.writer writes str and handles line endings itself; the original
    # "wb+" binary mode raised TypeError on writerow.
    with open(path, "w", newline="") as output_file:
        writer = csv.writer(output_file, delimiter="\t")
        if len(statements) > 0:
            statements = [
                removeNonAscii(statement) for statement in statements
            ]
            ngrams = get_bigram_likelihood(statements)
            # NOTE(review): the '' comparison suggests get_bigram_likelihood
            # can return an empty string — kept as-is; confirm its contract.
            if ngrams != '':
                for ngram in ngrams:
                    # ngram is ((word1, word2), score)
                    writer.writerow(
                        [ngram[0][0] + '_' + ngram[0][1], ngram[1]])


def save_bigrams_for_replacement_file_txt(statements, path):
    """Save likely bigrams in a txt, one space-separated bigram per line.

    :param statements: list of strings
    :param path: output path for saved txt
    """
    # Text mode: write() below passes str, which "wb+" rejects on Python 3.
    with open(path, "w") as output_file:
        if len(statements) > 0:
            statements = [
                removeNonAscii(statement) for statement in statements
            ]
            ngrams = get_bigram_likelihood(statements)
            if ngrams != '':
                for ngram in ngrams:
                    output_file.write(
                        ngram[0][0] + ' ' + ngram[0][1] + '\n')


if __name__ == "__main__":
    parser = DebateParser("./data/debates")
    parser.parse()
    # Build the flattened statement list once instead of duplicating the
    # comprehension for each output file.
    statements = [
        item[0]
        for sublist in parser.statements.values()
        for item in sublist
    ]
    save_bigram_likelihood_tsv(statements, os.path.join("data", "ngrams.tsv"))
    save_bigrams_for_replacement_file_txt(
        statements, os.path.join("./data/mallet_files", "replacements.txt"))
from parse_debates import DebateParser
import ngrams
import os
import mallet
import build_graph
import networkx
'''
main entry point for generating mallet input files project
'''

if __name__ == "__main__":
    # Parse the raw debate transcripts and write the raw mallet input.
    parser = DebateParser("./data/debates")
    parser.parse()
    parser.build_text_for_mallet('./data/mallet_raw_statements.txt')

    # Each statements value is a list of tuples whose first element is the
    # statement text; flatten those and save bigrams for mallet replacement.
    all_statement_text = [
        entry[0]
        for group in parser.statements.values()
        for entry in group
    ]
    ngrams.save_bigrams_for_replacement_file_txt(
        all_statement_text,
        os.path.join("./data/mallet_files", "replacements.txt"))