import argparse
import os

import pandas as pd

from word2word import Word2word


def main(folder_path, lang):
    os.chdir(folder_path)
    files = os.listdir(folder_path)
    # Target-language ISO 639-1 code; computed up front so it is still
    # available for Word2word.make even if no CSV file matches below.
    tgt = convert_iso_code(lang)
    for file in files:
        if file.endswith(".csv") and lang in file:
            print(file)
            df = pd.read_csv(file, dtype=str, sep='\t')
            # Append to three aligned files: "<lang>" holds id|source|target,
            # "<lang>.en" the English side, and "<lang>.<tgt>" the target side.
            with open(lang, 'a') as f, \
                    open(lang + '.en', 'a') as e, \
                    open(lang + '.' + tgt, 'a') as t:
                for i, row in df.iterrows():
                    if (isinstance(row['source_text'], str)
                            and isinstance(row['target_text'], str)
                            and row['item_type'] != 'RESPONSE'):
                        source = remove_punctuation_and_lower_case(
                            row['source_text'])
                        target = remove_punctuation_and_lower_case(
                            row['target_text'])
                        # Write the same row index on all three lines so the
                        # files stay aligned with one another.
                        f.write(str(i) + '|' + source + '|' + target + "\n")
                        e.write(str(i) + '|' + source + "\n")
                        t.write(str(i) + '|' + target + "\n")
    print(folder_path + "/" + lang + "_dict")
    # Build a bilingual lexicon from the custom parallel corpus written above.
    mcsq_dict = Word2word.make("en", tgt, folder_path + "/" + lang,
                               savedir=folder_path)
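
# A minimal usage sketch (the folder path and language code below are
# hypothetical; this assumes the directory holds tab-separated MCSQ-style
# CSV exports whose filenames contain the language code, and that
# `convert_iso_code` and `remove_punctuation_and_lower_case` are defined
# elsewhere in this module):
#
#     main("/data/mcsq", "GER")
#
# This appends the id|source|target corpus files next to the CSVs and then
# builds an English-to-target bilingual lexicon from them via Word2word.make.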
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--lang1', type=str, required=True,
                        help="ISO 639-1 code of language. "
                             "See `http://opus.nlpl.eu/OpenSubtitles2018.php`")
    parser.add_argument('--lang2', type=str, required=True,
                        help="ISO 639-1 code of language. "
                             "See `http://opus.nlpl.eu/OpenSubtitles2018.php`")
    parser.add_argument('--datapref', type=str, default=None,
                        help="data prefix to a custom parallel corpus. "
                             "builds a bilingual lexicon using OpenSubtitles2018 "
                             "unless this option is provided.")
    parser.add_argument('--n_lines', type=int, default=100000000,
                        help="number of parallel sentences used")
    parser.add_argument('--cutoff', type=int, default=5000,
                        help="number of words that are used in calculating "
                             "collocates within each language")
    parser.add_argument('--rerank_width', default=100, type=int,
                        help="maximum number of target-side collocates "
                             "considered for reranking")
    parser.add_argument('--rerank_impl', default="multiprocessing", type=str,
                        help="choice of reranking implementation: "
                             "simple, multiprocessing (default)")
    parser.add_argument('--cased', dest="cased", action="store_true",
                        help="keep the case.")
    parser.add_argument('--n_translations', type=int, default=10,
                        help="number of final word2word translations kept")
    parser.add_argument('--save_cooccurrence', dest="save_cooccurrence",
                        action="store_true",
                        help="save the co-occurrence results")
    parser.add_argument('--save_pmi', dest="save_pmi", action="store_true",
                        help="save the PMI results")
    parser.add_argument('--savedir', type=str, default=None,
                        help="location to store bilingual lexicons. "
                             "make sure to use this input when loading from "
                             "a custom-built lexicon.")
    parser.add_argument('--num_workers', default=16, type=int,
                        help="number of workers used for multiprocessing")
    args = parser.parse_args()
    Word2word.make(**vars(args))
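

if __name__ == "__main__":
    # Standard entry point so the parser above can be driven from the shell,
    # e.g. (the script name and language codes here are illustrative):
    #     python make.py --lang1 en --lang2 fr --save_cooccurrence
    main()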