def main(argv):
    """Read a translation graph, stem every Spanish node, and connect each
    stem (suffixed with ``|stem`` and flagged ``is_stem=True``) to the
    Spanish phrase nodes it was derived from. Writes the enriched graph out.

    argv: script arguments — argv[1] input .dot file, argv[2] output .dot
          file, optional argv[3] == "splitmultiwords" to stem each word of
          a multiword phrase separately.
    """
    if len(argv) < 3:
        print("call: translations_spanish_graph_connectstemswithoutstopwords.py graph_file_in.dot graph_file_out.dot [splitmultiwords]", file=sys.stderr)
        sys.exit(1)

    split_multiwords = False
    if len(argv) == 4 and argv[3] == "splitmultiwords":
        print("Will split multiwords.")
        split_multiwords = True

    # Use argv consistently (the original mixed argv and sys.argv).
    with codecs.open(argv[1], "r", "utf-8") as IN:
        gr = read(IN.read())
    print("Parse finished.", file=sys.stderr)

    nodes = gr.nodes()
    stemmer = Stemmer.Stemmer('spanish')
    stopwords = qlc.utils.stopwords_from_file("data/stopwords/spa.txt")

    for n in nodes:
        # Only Spanish-language phrase nodes are stemmed.
        if "lang" in gr.node[n] and gr.node[n]["lang"] == "spa":
            phrase_without_stopwords = qlc.utils.remove_stopwords(n, stopwords)
            phrase_stems = qlc.utils.stem_phrase(phrase_without_stopwords, stemmer, split_multiwords)
            for stem in phrase_stems:
                # "|stem" suffix keeps stem nodes distinct from phrase nodes.
                stem = stem + "|stem"
                gr.add_node(stem, is_stem=True)
                gr.add_edge(stem, n)

    with codecs.open(argv[2], "w", "utf-8") as OUT:
        OUT.write(write(gr))
def main(argv):
    """Merge several translation graphs into one and write the union.

    argv: argv[1] is the first input .dot file (or a glob pattern if the
          literal path does not exist), argv[2:-1] are further input files,
          argv[-1] is the output .dot file.
    """
    if len(argv) < 4:
        print("call: translations_spanish_graph_connectstemswithoutstopwords.py graph_file_in_1.dot graph_file_in_2.dot [...] graph_file_out.dot", file=sys.stderr)
        sys.exit(1)

    file = sys.argv[1]
    if not os.path.exists(file):
        # Treat the first argument as a glob pattern; the remaining matches
        # become the rest of the input list.
        files = glob.glob(sys.argv[1])
        if len(files) == 0:
            print("No input files found.", file=sys.stderr)
            sys.exit(1)
        file = files.pop(0)
    else:
        files = argv[2:len(argv) - 1]

    print("Processing file {0}.".format(file), file=sys.stderr)
    try:
        IN = codecs.open(file, "r", "utf-8")
    except (IOError, OSError):
        # Narrowed from a bare except: only I/O failures are expected here.
        print("Could not open file {0}.".format(file), file=sys.stderr)
        sys.exit(1)
    gr = read(IN.read())
    IN.close()

    # BUG FIX: the original unconditionally reassigned
    # files = argv[2:len(argv)-1] here, clobbering the glob-derived list so
    # the glob branch never processed its remaining matches.
    for f in files:
        print("Processing file {0}.".format(f), file=sys.stderr)
        with codecs.open(f, "r", "utf-8") as IN:
            gr2 = read(IN.read())
        # Union the second graph into the accumulator, keeping attributes.
        for node in gr2:
            gr.add_node(node, gr2.node[node])
        for n1, n2 in gr2.edges_iter():
            gr.add_edge(n1, n2, gr2.edge[n1][n2])

    with codecs.open(sys.argv[len(argv) - 1], "w", "utf-8") as OUT:
        OUT.write(write(gr))
def combine_graphs():
    """Merge the per-dictdata graphs (one job output per dictdata id in the
    module-level ``loaded_data``) into a single graph and write it to the
    module-level ``filename_combined_graph``.

    NOTE(review): relies on module globals ``loaded_data``,
    ``generate_dictdata_graph_job`` and ``filename_combined_graph`` being
    set up by the surrounding script.
    """
    gr = None
    for dictdata_id in loaded_data["dictdata_ids"]:
        # The job id doubles as the filename of the per-dictdata graph.
        j = generate_dictdata_graph_job(dictdata_id)
        target_file = j.job_id
        with codecs.open(target_file, "r", "utf-8") as IN:
            if gr is None:
                # First graph becomes the accumulator.
                gr = read(IN.read())
            else:
                gr2 = read(IN.read())
                # Union nodes and edges (with attributes) into the accumulator.
                for node in gr2:
                    gr.add_node(node, gr2.node[node])
                for n1, n2 in gr2.edges_iter():
                    gr.add_edge(n1, n2, gr2.edge[n1][n2])

    with codecs.open(filename_combined_graph, "w", "utf-8") as OUT:
        OUT.write(write(gr))
def main(argv):
    """Export stem-connected translations as a tab-separated matrix.

    For every stem node in the input graph, collect its Spanish neighbours
    and their non-Spanish, non-stem neighbours (format ``head|source``),
    then write one row per Spanish phrase group with one column per source.

    argv: argv[1] input .dot graph, argv[2] output .csv file.
    """
    if len(argv) < 3:
        print("call: translations_spanish_graph_connectstemswithoutstopwords.py graph_file_in.dot matrix_file_out.csv", file=sys.stderr)
        sys.exit(1)

    # Use argv consistently (the original read sys.argv[1] but argv[2]).
    with codecs.open(argv[1], "r", "utf-8") as IN:
        gr = read(IN.read())
    print("Parse finished.", file=sys.stderr)

    matrix = {}
    sources = set()
    for node in gr:
        if "is_stem" in gr.node[node] and gr.node[node]["is_stem"]:
            # Spanish phrases directly attached to this stem.
            spanish_nodes = [n for n in gr[node]
                             if "lang" in gr.node[n] and gr.node[n]["lang"] == "spa"]
            head_nodes = []
            for sp in spanish_nodes:
                # Head words: neighbours that are neither Spanish nor stems.
                head_nodes += [n for n in gr[sp]
                               if ("lang" not in gr.node[n] or gr.node[n]["lang"] != "spa")
                               and ("is_stem" not in gr.node[n] or not gr.node[n]["is_stem"])]
            head_nodes = set(head_nodes)

            heads = collections.defaultdict(list)
            for head in head_nodes:
                # Head nodes are encoded as "head|source".
                (head, source) = head.split("|")
                sources.add(source)
                heads[source].append(head)
            matrix["|".join(sorted(spanish_nodes))] = heads

    with codecs.open(argv[2], "w", "utf-8") as OUT:
        sorted_sources = sorted(sources)
        OUT.write("{0}\t{1}\n".format("spa", "\t".join(sorted_sources)))
        for spanish in sorted(matrix):
            OUT.write(spanish)
            OUT.write("\t")
            sources_heads = []
            for source in sorted_sources:
                # defaultdict yields [] for sources absent from this row.
                heads = [h for h in matrix[spanish][source]]
                sources_heads.append("|".join(sorted(heads)))
            OUT.write("\t".join(sources_heads))
            OUT.write("\n")