def main(argv):
    """Connect Spanish stem nodes to their phrase nodes in a dot graph.

    argv[1] is the input .dot file, argv[2] the output .dot file; the
    optional argv[3] == "splitmultiwords" enables splitting of multiword
    phrases before stemming.

    For every node tagged ``lang == "spa"``, stopwords are removed, the
    remaining phrase is stemmed, and each stem gets a node ``<stem>|stem``
    (flagged ``is_stem=True``) linked back to the original phrase node.
    """
    if len(argv) < 3:
        print("call: translations_spanish_graph_connectstemswithoutstopwords.py graph_file_in.dot graph_file_out.dot [splitmultiwords]")
        sys.exit(1)

    split_multiwords = False
    if len(argv) == 4 and argv[3] == "splitmultiwords":
        print("Will split multiwords.")
        split_multiwords = True

    # Use the argv parameter consistently (original mixed argv and sys.argv).
    IN = codecs.open(argv[1], "r", "utf-8")
    gr = read(IN.read())
    IN.close()

    print("Parse finished.", file=sys.stderr)

    stemmer = Stemmer.Stemmer('spanish')
    stopwords = qlc.utils.stopwords_from_file("data/stopwords/spa.txt")

    for n in gr.nodes():
        # Only Spanish-language phrase nodes are stemmed.
        if "lang" in gr.node[n] and gr.node[n]["lang"] == "spa":
            phrase_without_stopwords = qlc.utils.remove_stopwords(n, stopwords)
            phrase_stems = qlc.utils.stem_phrase(phrase_without_stopwords, stemmer, split_multiwords)
            for stem in phrase_stems:
                # The "|stem" suffix keeps stem nodes distinct from phrase nodes.
                stem = stem + "|stem"
                gr.add_node(stem, is_stem=True)
                gr.add_edge(stem, n)

    OUT = codecs.open(argv[2], "w", "utf-8")
    OUT.write(write(gr))
    OUT.close()
def main(argv):
    """Merge several dot graphs into one combined dot graph.

    argv[1] .. argv[-2] name the input .dot files; argv[-1] is the output
    file. If argv[1] does not name an existing file it is treated as a
    glob pattern, and all matches are processed.

    Nodes (with attributes) and edges of every subsequent graph are merged
    into the graph read from the first file.
    """
    if len(argv) < 4:
        print("call: translations_spanish_graph_connectstemswithoutstopwords.py graph_file_in_1.dot graph_file_in_2.dot [...] graph_file_out.dot", file=sys.stderr)
        sys.exit(1)

    file = argv[1]
    if not os.path.exists(file):
        # Treat argv[1] as a glob pattern. BUGFIX: the original later
        # re-assigned `files` unconditionally, discarding every glob match
        # after the first; keep the remaining matches and append the
        # explicitly listed inputs instead.
        files = glob.glob(argv[1])
        if len(files) == 0:
            print("No input files found.", file=sys.stderr)
            sys.exit(1)
        file = files.pop(0)
        files += argv[2:len(argv)-1]
    else:
        files = argv[2:len(argv)-1]

    print("Processing file {0}.".format(file), file=sys.stderr)
    try:
        IN = codecs.open(file, "r", "utf-8")
    except IOError:  # narrowed from a bare except: only I/O errors expected
        print("Could not open file {0}.".format(file), file=sys.stderr)
        sys.exit(1)

    gr = read(IN.read())
    IN.close()

    for f in files:
        print("Processing file {0}.".format(f), file=sys.stderr)
        IN = codecs.open(f, "r", "utf-8")
        gr2 = read(IN.read())
        # Merge nodes (with their attribute dicts) and edges into gr.
        for node in gr2:
            gr.add_node(node, gr2.node[node])
        for n1, n2 in gr2.edges_iter():
            gr.add_edge(n1, n2, gr2.edge[n1][n2])
        IN.close()

    OUT = codecs.open(argv[len(argv)-1], "w", "utf-8")
    OUT.write(write(gr))
    OUT.close()
 def combine_graphs():
     """Combine the per-dictdata dot graphs into one graph and write it out.

     For each dictdata id in ``loaded_data["dictdata_ids"]``, the graph file
     named by its generation job's id is read; the first becomes the base
     graph and all later ones are merged into it (nodes with attributes,
     then edges). The result is written to ``filename_combined_graph``.
     """
     gr = None
     for dictdata_id in loaded_data["dictdata_ids"]:
         j = generate_dictdata_graph_job(dictdata_id)
         # The job id doubles as the name of the per-dictdata graph file.
         target_file = j.job_id
         IN = codecs.open(target_file, "r", "utf-8")
         if gr is None:  # identity test for None (was `== None`)
             gr = read(IN.read())
         else:
             gr2 = read(IN.read())
             for node in gr2:
                 gr.add_node(node, gr2.node[node])
             for n1, n2 in gr2.edges_iter():
                 gr.add_edge(n1, n2, gr2.edge[n1][n2])
         IN.close()
     OUT = codecs.open(filename_combined_graph, "w", "utf-8")
     OUT.write(write(gr))
     OUT.close()
def main(argv):
    """Write a tab-separated matrix of Spanish phrase groups vs. sources.

    argv[1] is the input .dot graph, argv[2] the output CSV. For every stem
    node (``is_stem``), its adjacent Spanish phrase nodes form one row key
    (pipe-joined, sorted); the heads reachable through those phrases are
    grouped per source into the row's cells.
    """
    if len(argv) < 3:
        print("call: translations_spanish_graph_connectstemswithoutstopwords.py graph_file_in.dot matrix_file_out.csv")
        sys.exit(1)

    # Use the argv parameter consistently (original mixed argv and sys.argv).
    IN = codecs.open(argv[1], "r", "utf-8")
    gr = read(IN.read())
    IN.close()

    print("Parse finished.", file=sys.stderr)

    matrix = {}
    sources = set()
    for node in gr:
        if "is_stem" in gr.node[node] and gr.node[node]["is_stem"]:
            # Spanish phrase nodes attached to this stem.
            spanish_nodes = [n for n in gr[node] if "lang" in gr.node[n] and gr.node[n]["lang"] == "spa"]
            head_nodes = []
            for sp in spanish_nodes:
                # Heads: neighbours that are neither Spanish nor stem nodes.
                head_nodes += [n for n in gr[sp] if ("lang" not in gr.node[n] or gr.node[n]["lang"] != "spa") and ("is_stem" not in gr.node[n] or not gr.node[n]["is_stem"])]
            head_nodes = set(head_nodes)

            heads = collections.defaultdict(list)
            for head_node in head_nodes:
                # Head node names encode "head|source"; assumes exactly one
                # "|" per name — TODO confirm against graph construction.
                (head, source) = head_node.split("|")
                sources.add(source)
                heads[source].append(head)
            matrix["|".join(sorted(spanish_nodes))] = heads

    OUT = codecs.open(argv[2], "w", "utf-8")
    sorted_sources = sorted(sources)
    OUT.write("{0}\t{1}\n".format("spa", "\t".join(sorted_sources)))
    for spanish in sorted(matrix):
        OUT.write(spanish)
        OUT.write("\t")
        sources_heads = []
        # Reuse sorted_sources instead of re-sorting per row; missing
        # sources yield an empty cell via the defaultdict.
        for source in sorted_sources:
            sources_heads.append("|".join(sorted(matrix[spanish][source])))
        OUT.write("\t".join(sources_heads))
        OUT.write("\n")
    OUT.close()