termlist_name		= args.termlist_name	# sys.argv[7]	   # Term list name (evaluation list name)
	dictionary_name		= args.dictionary_name  # sys.argv[8] 	   # Bilingual dictionary


	# Input/output paths derived from the CLI arguments; all relative to ../data/train/.
	# NOTE(review): this chunk starts mid-function (the first line lost its leading
	# indent during extraction) — the function header lies outside this view.
	path_vocab_src     	= "../data/train/corpora/" + corpus + '/tmp/vocab_'+source_lang + ".csv"
	path_vocab_tgt     	= "../data/train/corpora/" + corpus + '/tmp/vocab_'+target_lang + ".csv"
	path_ctxvec			= "../data/train/corpora/" + corpus + '/context_vectors/'+corpus+'_'+source_lang + '_w' + str(w) + ".vect"
	path_ctxvec_assoc	= "../data/train/corpora/" + corpus + '/context_vectors/'+corpus+'_'+source_lang + '_w' + str(w) + "_min" + str(min_occ)+ "_"+ assoc + ".assoc"
	path_ctxvec_trad    = "../data/train/corpora/" + corpus + '/context_vectors/'+corpus+'_'+source_lang + '_w' + str(w) + "_min" + str(min_occ)+ "_"+ assoc + ".assoc.trad"
	path_dict 			= "../data/train/dictionaries/" + dictionary_name # e.g. dicfrenelda-utf8.txt
	path_termlist_csv	= "../data/train/termlists/"+ termlist_name
	# ----------------------------------------------------------------------------------------------------


	# Pipeline: load vocabularies + association-weighted source context vectors,
	# then translate each vector through the bilingual dictionary and save.
	# NOTE(review): the matching except/cleanup for this try is not shown in
	# this chunk — it continues past the visible lines.
	try: 
		print "Translate " +  DSA.Language_MAP(source_lang) + " source context vectors..."
		# Load source vocabulary (token -> occurrence counts)
		occ_src 			  = DSA.load_occurrence_vectors(path_vocab_src)
		# Load target vocabulary (token -> occurrence counts)
		occ_tgt 			  = DSA.load_occurrence_vectors(path_vocab_tgt)
		# Load association-weighted source context vectors
		context_vectors_assoc = DSA.load_context_vectors(path_ctxvec_assoc)
		# Load term list (evaluation list) and its inverse mapping
		termlist,termlist_inv =	DSA.load_termlist(path_termlist_csv)	
		# Load bilingual dictionary (filtered against both vocabularies and the term list)
		dico,dico_cooc		  = load_fren_elra_dictionary(path_dict,occ_src,occ_tgt,termlist)
		# Translate source context vectors into the target language space
		context_vectors_trad  = trad_context_vectors(termlist, context_vectors_assoc, dico, dico_cooc, occ_src, occ_tgt)
		# Save translated context vectors
		write_context_vectors(path_ctxvec_trad, context_vectors_trad)
# ---- Esempio n. 2 (Example no. 2) — scraped-example separator ----
# 0
    corpus = args.corpus  # sys.argv[1]
    lang = args.lang  # sys.argv[2]	# Language : en/fr/...
    corpus_type = args.corpus_type  # sys.argv[3]	# Flag     : tok/lem/postag
    flag_filter = args.flag_filter  # True if  int(sys.argv[4]) == 1 else False	# Filter stopwords 1/0
    w = int(
        args.w
    )  # int(sys.argv[5]) # : window size 1/2/3... number of words before and after the center word

    corpus_dir = "../data/train/corpora/" + corpus + '/' + corpus_type + '/' + lang
    stopwords_path = "../data/train/stopwords/" + "stopwords_" + lang + ".txt"
    path_vocab = "../data/train/corpora/" + corpus + '/tmp/vocab_' + lang + ".csv"
    path_ctxvec = "../data/train/corpora/" + corpus + '/context_vectors/' + corpus + '_' + lang + '_w' + str(
        w) + ".vect"

    try:
        print "Building " + DSA.Language_MAP(lang) + " context vectors..."

        # Load stopwords
        stopwords = DSA.load_stopwords(stopwords_path)
        # Build word context vectors
        occ, coocc = build_context_vectors(corpus_dir, lang, corpus_type,
                                           stopwords, w)
        # Save vocabulary
        write_vocab(path_vocab, occ)
        # Save Context vectors
        write_context_vectors(path_ctxvec, coocc, occ)

        print "Done."
    except:
        print "Unexpected error ", sys.exc_info()[0]
# ---- Esempio n. 3 (Example no. 3) — scraped-example separator ----
# 0
	w 					= int(args.w)			# int(sys.argv[6]) # : window size 1/2/3... number of words before and after the center word
	min_occ				= int(args.min_occ)		# int(sys.argv[7]) # : filtering tokens with number of occurence less than min_occ
	termlist_name		= args.termlist_name	# sys.argv[8]	   # Term list name (evaluation list name)

	# Input/output paths derived from the CLI arguments; all relative to ../data/train/.
	path_vocab      	= "../data/train/corpora/" + corpus + '/tmp/vocab_'+lang + ".csv"
	path_ctxvec			= "../data/train/corpora/" + corpus + '/context_vectors/'+corpus+'_'+lang + '_w' + str(w) + ".vect"
	path_ctxvec_assoc	= "../data/train/corpora/" + corpus + '/context_vectors/'+corpus+'_'+lang + '_w' + str(w) + "_min" + str(min_occ)+ "_"+ assoc + ".assoc"
	path_termlist_csv	= "../data/train/termlists/" + termlist_name
	# ----------------------------------------------------------------------------------------------------




	# Pipeline: weight raw co-occurrence context vectors with the association
	# measure selected by `assoc` (e.g. "mi", "odds").
	# NOTE(review): the matching except for this try, and the "odds" branch
	# body, continue beyond this chunk.
	try: 
		print "Building " +  DSA.Association_measure_MAP(assoc) + " for " + DSA.Language_MAP(lang) + " Corpus ..."
		# Load occurrence vectors (per-token counts)
		occ = DSA.load_occurrence_vectors(path_vocab)
	
		# Load cooccurrence vectors
		context_vectors = DSA.load_context_vectors(path_ctxvec)

		# Compute Contingency Table (marginal and joint counts used by the measures)
		Tab_occ_X,Tab_cooc_XY,Tab_cooc_X_ALL,Tab_cooc_ALL_Y,Total = compute_contingency_table(context_vectors)	

		if assoc.lower() == "mi":

			# Compute Mutual Information
			# NOTE(review): stray extra space in the indentation of the next
			# line — legal here (sole statement of the if-body) but worth
			# normalizing.
			 context_vectors_assoc = compute_MI(context_vectors,Tab_occ_X,Tab_cooc_XY,Tab_cooc_X_ALL,Tab_cooc_ALL_Y,Total)

		if assoc.lower() == "odds":