# Fragment of the "translate source context vectors" step. The text below
# derives the vocabulary, context-vector, dictionary and term-list paths from
# `args`, `corpus`, `source_lang`, `target_lang`, `w`, `min_occ` and `assoc`
# (all defined outside this fragment), loads the source/target vocabularies,
# the source context vectors, the term list and the bilingual dictionary,
# translates the source context vectors and writes them to the ".assoc.trad"
# path.
# NOTE(review): the statements below have lost their line breaks. As written,
# Python executes only the first assignment and treats everything after the
# first '#' as a comment. The `try:` also has no handler visible in this
# fragment. Restore the original line structure before relying on this code.
termlist_name = args.termlist_name # sys.argv[7] # Term list name (evaluation list name) dictionary_name = args.dictionary_name # sys.argv[8] # Bilingual dictionary path_vocab_src = "../data/train/corpora/" + corpus + '/tmp/vocab_'+source_lang + ".csv" path_vocab_tgt = "../data/train/corpora/" + corpus + '/tmp/vocab_'+target_lang + ".csv" path_ctxvec = "../data/train/corpora/" + corpus + '/context_vectors/'+corpus+'_'+source_lang + '_w' + str(w) + ".vect" path_ctxvec_assoc = "../data/train/corpora/" + corpus + '/context_vectors/'+corpus+'_'+source_lang + '_w' + str(w) + "_min" + str(min_occ)+ "_"+ assoc + ".assoc" path_ctxvec_trad = "../data/train/corpora/" + corpus + '/context_vectors/'+corpus+'_'+source_lang + '_w' + str(w) + "_min" + str(min_occ)+ "_"+ assoc + ".assoc.trad" path_dict = "../data/train/dictionaries/" + dictionary_name #dicfrenelda-utf8.txt" path_termlist_csv = "../data/train/termlists/"+ termlist_name # ---------------------------------------------------------------------------------------------------- try: print "Translate " + DSA.Language_MAP(source_lang) + " source context vectors..." # Load source vocabulary occ_src = DSA.load_occurrence_vectors(path_vocab_src) # Load target vocabulary occ_tgt = DSA.load_occurrence_vectors(path_vocab_tgt) # Load source context vectors context_vectors_assoc = DSA.load_context_vectors(path_ctxvec_assoc) # Load term list (evaluation list) termlist,termlist_inv = DSA.load_termlist(path_termlist_csv) # Load bilingual dictionary dico,dico_cooc = load_fren_elra_dictionary(path_dict,occ_src,occ_tgt,termlist) # Translate source context vectors context_vectors_trad = trad_context_vectors(termlist, context_vectors_assoc, dico, dico_cooc, occ_src, occ_tgt) # Save translated context vectors write_context_vectors(path_ctxvec_trad, context_vectors_trad)
# ----------------------------------------------------------------------------
# Build the context vectors of one corpus/language and save them.
#
# Inputs (from `args`): corpus, lang, corpus_type, flag_filter, w.
# Outputs: the vocabulary is handed to write_vocab() with `path_vocab`, and the
# context vectors to write_context_vectors() with `path_ctxvec`.
# ----------------------------------------------------------------------------
corpus = args.corpus  # sys.argv[1]
lang = args.lang  # sys.argv[2] - language: en/fr/...
corpus_type = args.corpus_type  # sys.argv[3] - flag: tok/lem/postag
# Formerly `True if int(sys.argv[4]) == 1 else False` - filter stopwords 1/0.
# NOTE(review): flag_filter is not consulted in this block; the stopword list
# is always loaded and passed to build_context_vectors().
flag_filter = args.flag_filter
# Formerly int(sys.argv[5]) - window size 1/2/3...: number of words before and
# after the center word.
w = int(args.w)

corpus_dir = "../data/train/corpora/" + corpus + '/' + corpus_type + '/' + lang
stopwords_path = "../data/train/stopwords/" + "stopwords_" + lang + ".txt"
path_vocab = "../data/train/corpora/" + corpus + '/tmp/vocab_' + lang + ".csv"
path_ctxvec = ("../data/train/corpora/" + corpus + '/context_vectors/'
               + corpus + '_' + lang + '_w' + str(w) + ".vect")

try:
    # Single-argument print(...) emits the same text under Python 2's print
    # statement and Python 3's print function.
    print("Building " + DSA.Language_MAP(lang) + " context vectors...")

    # Load stopwords
    stopwords = DSA.load_stopwords(stopwords_path)

    # Build word context vectors
    occ, coocc = build_context_vectors(corpus_dir, lang, corpus_type, stopwords, w)

    # Save vocabulary
    write_vocab(path_vocab, occ)

    # Save context vectors
    write_context_vectors(path_ctxvec, coocc, occ)

    print("Done.")
except Exception as err:
    # Still best-effort (the error is reported, not re-raised), but
    # KeyboardInterrupt/SystemExit are no longer swallowed, and repr(err)
    # shows both the exception type and its message.
    print("Unexpected error " + repr(err))
# Fragment of the "association measure" step. The text below reads the window
# size, minimum occurrence count and term-list name from `args`, derives the
# vocabulary, context-vector, association-vector and term-list paths (using
# `corpus`, `lang` and `assoc`, defined outside this fragment), loads the
# occurrence and co-occurrence vectors, computes a contingency table and, when
# `assoc` is "mi", calls compute_MI().
# NOTE(review): the statements below have lost their line breaks. As written,
# Python executes only the first assignment and treats everything after the
# first '#' as a comment. The fragment is also cut off after the "odds" test,
# inside an open `try:`. Restore the original line structure before relying on
# this code.
w = int(args.w) # int(sys.argv[6]) # : window size 1/2/3... number of words before and after the center word min_occ = int(args.min_occ) # int(sys.argv[7]) # : filtering tokens with number of occurrence less than min_occ termlist_name = args.termlist_name # sys.argv[8] # Term list name (evaluation list name) path_vocab = "../data/train/corpora/" + corpus + '/tmp/vocab_'+lang + ".csv" path_ctxvec = "../data/train/corpora/" + corpus + '/context_vectors/'+corpus+'_'+lang + '_w' + str(w) + ".vect" path_ctxvec_assoc = "../data/train/corpora/" + corpus + '/context_vectors/'+corpus+'_'+lang + '_w' + str(w) + "_min" + str(min_occ)+ "_"+ assoc + ".assoc" path_termlist_csv = "../data/train/termlists/" + termlist_name # ---------------------------------------------------------------------------------------------------- try: print "Building " + DSA.Association_measure_MAP(assoc) + " for " + DSA.Language_MAP(lang) + " Corpus ..." # Load occurrence vectors occ = DSA.load_occurrence_vectors(path_vocab) # Load cooccurrence vectors context_vectors = DSA.load_context_vectors(path_ctxvec) # Compute Contingency Table Tab_occ_X,Tab_cooc_XY,Tab_cooc_X_ALL,Tab_cooc_ALL_Y,Total = compute_contingency_table(context_vectors) if assoc.lower() == "mi": #Compute Mutual Information context_vectors_assoc = compute_MI(context_vectors,Tab_occ_X,Tab_cooc_XY,Tab_cooc_X_ALL,Tab_cooc_ALL_Y,Total) if assoc.lower() == "odds":