import sys
import codecs

import Stemmer

import qlc.utils
from qlc.translationgraph import read, write


def main(argv):

    if len(argv) < 3:
        print("call: translations_spanish_graph_connectstemswithoutstopwords.py graph_file_in.dot graph_file_out.dot [splitmultiwords]", file=sys.stderr)
        sys.exit(1)
        
    split_multiwords = False
    if len(argv) == 4 and argv[3] == "splitmultiwords":
        print("Will split multiwords.")
        split_multiwords = True

    IN = codecs.open(argv[1], "r", "utf-8")
    gr = read(IN.read())
    IN.close()
 
    print("Parse finished.", file=sys.stderr)
    nodes = gr.nodes()

    stemmer = Stemmer.Stemmer('spanish')
    stopwords = qlc.utils.stopwords_from_file("data/stopwords/spa.txt")

    # link every Spanish phrase node to the stems of its content words
    for n in nodes:
        if "lang" in gr.node[n] and gr.node[n]["lang"] == "spa":
            phrase_without_stopwords = qlc.utils.remove_stopwords(n, stopwords)
            phrase_stems = qlc.utils.stem_phrase(phrase_without_stopwords, stemmer, split_multiwords)
            for stem in phrase_stems:
                stem = stem + "|stem"
                gr.add_node(stem, is_stem=True)
                gr.add_edge(stem, n)
    
    OUT = codecs.open(argv[2], "w", "utf-8")
    OUT.write(write(gr))
    OUT.close()
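# A minimal entry point, assuming the script is run directly as in the
# usage string above (this guard is not part of the original excerpt):
if __name__ == "__main__":
    main(sys.argv)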
import sys
import codecs

from networkx import Graph

# the qlc import paths below are assumed from the qlc package layout
from qlc.corpusreader import CorpusReaderDict
from qlc.translationgraph import write, escape_string


def main(argv):

    if len(argv) < 3:
        print("call: translations_spanish_graph.py data_path (bibtex_key|component)", file=sys.stderr)
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])

    # look up dictionary data first by bibtex key, then by component name
    dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
    if len(dictdata_ids) == 0:
        dictdata_ids = cr.dictdata_ids_for_component(argv[2])
        if len(dictdata_ids) == 0:
            print("No dictionary data found for bibtex_key or component {0}.".format(argv[2]), file=sys.stderr)
            sys.exit(1)
        

    for dictdata_id in dictdata_ids:
        gr = Graph()
        src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id)
        # only process dictionaries that have Spanish on exactly one side
        if src_language_iso != ['spa'] and tgt_language_iso != ['spa']:
            continue

        if len(src_language_iso) > 1 or len(tgt_language_iso) > 1:
            continue
        
        # the non-Spanish language is the head language of this graph
        if tgt_language_iso == ['spa']:
            language_iso = src_language_iso[0]
        else:
            language_iso = tgt_language_iso[0]
                        
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        bibtex_key = dictdata_string.split("_")[0]

        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            # normalize so that the head is always the non-Spanish entry
            if src_language_iso == ['spa']:
                head, translation = translation, head
            head_with_source = escape_string("{0}|{1}".format(head, bibtex_key))
            translation = escape_string(translation)
            
            # add_node and add_edge are idempotent in networkx, so no
            # membership checks are needed here
            gr.add_node(head_with_source, attr_dict={ "lang": language_iso, "source": bibtex_key })
            gr.add_node(translation, attr_dict={ "lang": "spa" })
            gr.add_edge(head_with_source, translation)

        output = codecs.open("{0}.dot".format(dictdata_string), "w", "utf-8")
        output.write(write(gr))
        output.close()
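# The loop above relies on networkx's add_node/add_edge being idempotent:
# re-adding a node merges its attributes, and re-adding an edge keeps a
# single edge. A minimal sketch with hypothetical node names, using the
# networkx 1.x API that the code above targets:
import networkx

g = networkx.Graph()
g.add_node("head|key1", attr_dict={"lang": "deu"})
g.add_node("head|key1", attr_dict={"source": "key1"})  # same node, attributes merged
g.add_edge("head|key1", "comer")
g.add_edge("head|key1", "comer")  # still a single edge

assert g.number_of_nodes() == 2
assert g.number_of_edges() == 1
assert g.node["head|key1"] == {"lang": "deu", "source": "key1"}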
import sys
import os
import glob
import codecs

from qlc.translationgraph import read, write


def main(argv):

    if len(argv) < 4:
        print("call: translations_spanish_graph_connectstemswithoutstopwords.py graph_file_in_1.dot graph_file_in_2.dot [...] graph_file_out.dot", file=sys.stderr)
        sys.exit(1)
        
    # the first argument is either an existing file or a glob pattern
    file = argv[1]
    if os.path.exists(file):
        files = argv[2:len(argv)-1]
    else:
        files = glob.glob(argv[1])
        if len(files) == 0:
            print("No input files found.", file=sys.stderr)
            sys.exit(1)
        file = files.pop(0)

    print("Processing file {0}.".format(file), file=sys.stderr)
    try:
        IN = codecs.open(file, "r", "utf-8")
    except IOError:
        print("Could not open file {0}.".format(file), file=sys.stderr)
        sys.exit(1)

    gr = read(IN.read())
    IN.close()
    
    # merge the remaining graphs into the first one, copying node and
    # edge attributes (networkx 1.x API)
    for f in files:
        print("Processing file {0}.".format(f), file=sys.stderr)
        IN = codecs.open(f, "r", "utf-8")
        gr2 = read(IN.read())
        for node in gr2:
            gr.add_node(node, gr2.node[node])
        for n1, n2 in gr2.edges_iter():
            gr.add_edge(n1, n2, gr2.edge[n1][n2])
        IN.close()
    
    OUT = codecs.open(argv[-1], "w", "utf-8")
    OUT.write(write(gr))
    OUT.close()
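# In newer networkx releases, merging two graphs node-by-node as above can
# be written with networkx.compose(), which also copies node and edge
# attributes (the second graph's attributes win on conflicts). A sketch
# with made-up data:
import networkx

a = networkx.Graph([("com|stem", "comer")])
b = networkx.Graph([("comer", "head|key1")])
merged = networkx.compose(a, b)
assert merged.number_of_edges() == 2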
def combine_graphs():
    gr = None
    for dictdata_id in loaded_data["dictdata_ids"]:
        j = generate_dictdata_graph_job(dictdata_id)
        target_file = j.job_id
        IN = codecs.open(target_file, "r", "utf-8")
        if gr is None:
            gr = read(IN.read())
        else:
            # merge this graph into the combined one, attributes included
            gr2 = read(IN.read())
            for node in gr2:
                gr.add_node(node, gr2.node[node])
            for n1, n2 in gr2.edges_iter():
                gr.add_edge(n1, n2, gr2.edge[n1][n2])
        IN.close()
    OUT = codecs.open(filename_combined_graph, "w", "utf-8")
    OUT.write(write(gr))
    OUT.close()
    # nested job body: dictdata_id, cr, dictdata_string and target_file
    # come from the enclosing job-factory scope
    def generate_dictdata_graph():
        gr = Graph()
        src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id)
        if src_language_iso != 'spa' and tgt_language_iso != 'spa':
            raise NoSpanishException

        # the non-Spanish language is the head language of this graph
        if tgt_language_iso == 'spa':
            language_iso = src_language_iso
        else:
            language_iso = tgt_language_iso

        bibtex_key = dictdata_string.split("_")[0]

        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            # normalize so that the head is always the non-Spanish entry
            if src_language_iso == 'spa':
                head, translation = translation, head

            head_with_source = escape_string("{0}|{1}".format(head, bibtex_key))
            translation = escape_string(translation)

            # add_node and add_edge are idempotent, no membership checks needed
            gr.add_node(head_with_source, attr_dict={ "lang": language_iso, "source": bibtex_key })
            gr.add_node(translation, attr_dict={ "lang": "spa" })
            gr.add_edge(head_with_source, translation)

        output = codecs.open(target_file, "w", "utf-8")
        output.write(write(gr))
        output.close()
Example #6
# `stemmer`, `split_multiwords`, `combined_graph` and `combined_graph_stemmed`
# are defined in earlier cells of this notebook excerpt
stopwords = qlc.utils.stopwords_from_file("../../src/qlc/data/stopwords/spa.txt")

for node in combined_graph.nodes():
    if "lang" in combined_graph.node[node] and combined_graph.node[node]["lang"] == "spa":
        phrase_without_stopwords = qlc.utils.remove_stopwords(node, stopwords)
        phrase_stems = qlc.utils.stem_phrase(phrase_without_stopwords, stemmer, split_multiwords)
        for stem in phrase_stems:
            stem = stem + "|stem"
            combined_graph_stemmed.add_node(stem, is_stem=True)
            combined_graph_stemmed.add_edge(stem, node)

print(networkx.algorithms.components.number_connected_components(combined_graph_stemmed))
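# Why the stem nodes reduce the component count: two phrase nodes with no
# direct edge fall into one component as soon as a shared stem links them.
# A toy illustration with made-up node names:
toy = networkx.Graph()
toy.add_edge("comer", "head1|key1")      # two unconnected translation pairs
toy.add_edge("comiendo", "head2|key2")
print(networkx.number_connected_components(toy))   # 2
toy.add_edge("com|stem", "comer")        # a shared stem bridges the pairs
toy.add_edge("com|stem", "comiendo")
print(networkx.number_connected_components(toy))   # 1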

OUT = codecs.open("translation_graph_stemmed.dot", "w", "utf-8")
OUT.write(write(combined_graph_stemmed))
OUT.close()

matrix = {}
sources = set()
for node in combined_graph_stemmed:
    if combined_graph_stemmed.node[node].get("is_stem"):
        spanish_nodes = [n for n in combined_graph_stemmed[node]
                         if combined_graph_stemmed.node[n].get("lang") == "spa"]
        head_nodes = []
        for sp in spanish_nodes:
            head_nodes += [n for n in combined_graph_stemmed[sp]
                           if combined_graph_stemmed.node[n].get("lang") != "spa"
                           and not combined_graph_stemmed.node[n].get("is_stem")]
        head_nodes = set(head_nodes)

        heads = collections.defaultdict(list)
        for head in head_nodes:
            (head, source) = head.split("|")
# <codecell>

len(combined_graph_stemmed.nodes())

# <markdowncell>

# ## Export the merged graph as DOT
#
# The graph may now be exported to the DOT format, so that it can be used in other tools for graph analysis or visualization. For this we use a helper function from the [qlc library](https://github.com/pbouda/qlc):

# <codecell>

from qlc.translationgraph import read, write
OUT = codecs.open("translation_graph_stemmed.dot", "w", "utf-8")
OUT.write(write(combined_graph_stemmed))
OUT.close()
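# <markdowncell>

# The resulting DOT file can then be rendered with Graphviz, for example with `dot -Tsvg translation_graph_stemmed.dot -o translation_graph_stemmed.svg` (the output file name here is just an example).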

# <markdowncell>

# ## Extract a subgraph for the stem of "comer"
#
# As an example of how to further process the graph, we will now extract the subgraph for the stem "comer". For this, the graph is traversed again until the node "com|stem" is found. All neighbours of this node are copied to a new graph. We also remove the sources from the node strings to make the final visualization more readable:

# <codecell>

comer_graph = networkx.Graph()
for node in combined_graph_stemmed:
    if node == "com|stem":
        comer_graph.add_node(node)
        # spanish nodes