Example #1
0
def main(folder_path, lang):
    """Export parallel text from the CSV files in a folder and build a lexicon.

    Every ``.csv`` file in ``folder_path`` whose name contains ``lang`` is read
    as tab-separated text. Rows whose ``source_text`` and ``target_text`` are
    both strings and whose ``item_type`` is not ``'RESPONSE'`` are passed
    through ``remove_punctuation_and_lower_case`` and appended to three files
    inside ``folder_path``:

    * ``<lang>``         -- ``index|source|target``
    * ``<lang>.en``      -- ``index|source``
    * ``<lang>.<code>``  -- ``index|target``

    where ``<code>`` is ``convert_iso_code(lang)``. The files are opened in
    append mode, so repeated runs add to existing content. Afterwards
    ``Word2word.make`` is called with ``"en"``, ``<code>``, the data prefix
    ``<folder_path>/<lang>`` and ``savedir=folder_path``.

    Args:
        folder_path: Directory holding the CSV files. The process working
            directory is changed to it as a side effect.
        lang: Language tag used both to select the CSV files and to name the
            output files.
    """
    # Resolve before chdir: a relative path would otherwise be re-interpreted
    # against the new working directory by listdir and by Word2word.make.
    folder_path = os.path.abspath(folder_path)
    os.chdir(folder_path)

    # Loop-invariant, and needed by Word2word.make below even when no CSV
    # file matches (it used to be bound only inside the loop).
    target_code = convert_iso_code(lang)

    for file in os.listdir(folder_path):
        if not (file.endswith(".csv") and lang in file):
            continue
        print(file)
        df = pd.read_csv(file, dtype=str, sep='\t')
        with open(lang, 'a') as f, \
                open(lang + '.en', 'a') as e, \
                open(lang + '.' + target_code, 'a') as t:
            for row_label, row in df.iterrows():
                # Skip rows where either text is not a string (presumably
                # missing cells -- confirm) and skip RESPONSE items.
                if not (isinstance(row['source_text'], str)
                        and isinstance(row['target_text'], str)
                        and row['item_type'] != 'RESPONSE'):
                    continue
                source = remove_punctuation_and_lower_case(row['source_text'])
                target = remove_punctuation_and_lower_case(row['target_text'])
                # NOTE(review): the combined file is keyed by the row label,
                # while the two per-language files are keyed by label + 1.
                # This reproduces the original output exactly; confirm
                # whether the offset is intended.
                f.write(str(row_label) + '|' + source + '|' + target + "\n")
                split_index = str(row_label + 1)
                e.write(split_index + '|' + source + "\n")
                t.write(split_index + '|' + target + "\n")

    print(folder_path + "/" + lang + "_dict")
    Word2word.make("en",
                   target_code,
                   folder_path + "/" + lang,
                   savedir=folder_path)
Example #2
0
def main():
    """Parse the command-line options and build a bilingual lexicon.

    The options are declared in a table and registered in order; the parsed
    values are then forwarded unchanged, as keyword arguments, to
    ``Word2word.make``.
    """
    iso_code_help = ("ISO 639-1 code of language. "
                     "See `http://opus.nlpl.eu/OpenSubtitles2018.php`")

    # (flag, add_argument keyword arguments), in registration order.
    option_table = [
        ('--lang1', dict(type=str, required=True, help=iso_code_help)),
        ('--lang2', dict(type=str, required=True, help=iso_code_help)),
        ('--datapref', dict(
            type=str,
            default=None,
            help=("data prefix to a custom parallel corpus. "
                  "builds a bilingual lexicon using OpenSubtitles2018 "
                  "unless this option is provided."),
        )),
        ('--n_lines', dict(
            type=int,
            default=100000000,
            help="number of parallel sentences used",
        )),
        ('--cutoff', dict(
            type=int,
            default=5000,
            help=("number of words that are used in calculating "
                  "collocates within each language"),
        )),
        ('--rerank_width', dict(
            type=int,
            default=100,
            help=("maximum number of target-side collocates "
                  "considered for reranking"),
        )),
        ('--rerank_impl', dict(
            type=str,
            default="multiprocessing",
            help=("choice of reranking implementation: "
                  "simple, multiprocessing (default)"),
        )),
        ('--cased', dict(
            dest="cased",
            action="store_true",
            help="Keep the case.",
        )),
        ('--n_translations', dict(
            type=int,
            default=10,
            help="number of final word2word translations kept",
        )),
        ('--save_cooccurrence', dict(
            dest="save_cooccurrence",
            action="store_true",
            help="Save the cooccurrence results",
        )),
        ('--save_pmi', dict(
            dest="save_pmi",
            action="store_true",
            help="Save the pmi results",
        )),
        ('--savedir', dict(
            type=str,
            default=None,
            help=("location to store bilingual lexicons."
                  "make sure to use this input when loading from "
                  "a custom-bulit lexicon."),
        )),
        ('--num_workers', dict(
            type=int,
            default=16,
            help="number of workers used for multiprocessing",
        )),
    ]

    cli = argparse.ArgumentParser()
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()
    Word2word.make(**vars(options))