Example #1
def initialization():
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None, help="Tab-separated files to be classified")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output of the classification")

    ## Parameters required
    groupM = parser.add_argument_group('Mandatory')
    groupM.add_argument('-m', '--metadata', type=argparse.FileType('r'), required=True, help="Training metadata (YAML file). Take into account that explicit command line arguments will overwrite the values from metadata file")

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-s", "--source_lang", type=str, help="Source language (SL) of the input")
    groupO.add_argument("-t", "--target_lang", type=str, help="Target language (TL) of the input")
    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where the temporary files of this program are created")
    groupO.add_argument('-b', '--block_size', type=int, default=10000, help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count()-1), help="Number of processes to use")
    groupO.add_argument('--normalize_by_length', action='store_true', help="Normalize by length in qmax dict feature")
    groupO.add_argument('--treat_oovs', action='store_true', help="Special treatment for OOVs in qmax dict feature")
    groupO.add_argument('--qmax_limit', type=check_positive_or_zero, default=20, help="Number of max target words to be taken into account, sorted by length")    
    groupO.add_argument('--disable_features_quest', action='store_false', help="Disable less important features")
    groupO.add_argument('-g', '--good_examples',  type=check_positive_or_zero, default=50000, help="Number of good examples")
    groupO.add_argument('-w', '--wrong_examples', type=check_positive_or_zero, default=50000, help="Number of wrong examples")
    groupO.add_argument('--good_test_examples',  type=check_positive_or_zero, default=2000, help="Number of good test examples")
    groupO.add_argument('--wrong_test_examples', type=check_positive_or_zero, default=2000, help="Number of wrong test examples")
    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file with discarded TUs. TUs discarded by the classifier are written to this file in TSV format.")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for classifier. If an accuracy histogram is present in the metadata, the interval for its max value will be used as the default instead of the current default.")
    
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

    # Validating & parsing
    # Checking if metadata is specified
    preliminary_args = parser.parse_args()
    if preliminary_args.metadata is not None:
        # If so, we load values from metadata
        metadata_yaml = yaml.safe_load(preliminary_args.metadata)
        threshold = np.argmax(metadata_yaml["accuracy_histogram"])*0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold
        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)
    # Then we parse the arguments again so that options explicitly specified on the command line overwrite the metadata values
    args = parser.parse_args()
    logging_setup(args)
    
    # Extra checks for args here
    # Load dictionaries (these paths come from the metadata file via set_defaults)
    args.dict_sl_tl = ProbabilisticDictionary(args.source_dictionary)
    args.dict_tl_sl = ProbabilisticDictionary(args.target_dictionary)
    # Load classifier
    args.clf = joblib.load(args.classifier)

    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
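
A minimal sketch of the two-pass parsing pattern used above, with a hypothetical --threshold option and meta.yaml file: the first parse only locates the metadata file, set_defaults() injects its values as new defaults, and the second parse lets explicitly given command-line flags win over the metadata.

import argparse
import yaml

def parse_with_yaml_defaults(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--metadata', type=argparse.FileType('r'))
    parser.add_argument('--threshold', type=float, default=0.5)

    # First pass: only used to locate and read the metadata file
    preliminary = parser.parse_args(argv)
    if preliminary.metadata is not None:
        # Metadata values become parser defaults...
        parser.set_defaults(**yaml.safe_load(preliminary.metadata))
    # ...and the second pass lets explicit flags override them
    return parser.parse_args(argv)

# parse_with_yaml_defaults(['-m', 'meta.yaml'])                       -> threshold from YAML
# parse_with_yaml_defaults(['-m', 'meta.yaml', '--threshold', '0.8']) -> 0.8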
Example #2
def initialization():
    global nline
    global logging_level
    
    nline = 0
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None, help="Tab-separated files to be classified")      
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output of the classification")
    parser.add_argument('metadata', type=argparse.FileType('r'), default=None, help="Training metadata (YAML file)")    

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S", "--source_tokeniser_path", type=str, help="Source language (SL) tokeniser executable absolute path")
    groupO.add_argument("-T", "--target_tokeniser_path", type=str, help="Target language (TL) tokeniser executable absolute path")

    groupO.add_argument("--scol", default=3, type=check_positive, help ="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol", default=4, type=check_positive, help ="Target sentence column (starting in 1)")    


    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where the temporary files of this program are created")
    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file with discarded TUs. TUs discarded by the classifier are written to this file in TSV format.")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for classifier. If an accuracy histogram is present in the metadata, the interval for its max value will be used as the default instead of the current default.")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold will be removed (classifier score set to 0), unless the option --keep_lm_result is set.")
    groupO.add_argument('--keep_lm_result', action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.")

    groupO.add_argument('--score_only', action='store_true', help="Only output one column which is the bicleaner score", default=False)

    groupO.add_argument('--disable_hardrules', action='store_true', help="Disables the bicleaner_hardrules filtering (only bicleaner_classify is applied)")
    groupO.add_argument('--disable_lang_ident', default=False, action='store_true', help="Don't apply hardrules that use language detection")
    
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

    # Validating & parsing
    # Checking if metadata is specified
    args = parser.parse_args()
    logging_setup(args)
    
    logging_level = logging.getLogger().level    

    if logging_level <= logging.WARNING and logging_level != logging.DEBUG:
        #Getting rid of INFO messages when Moses processes start
        logging.getLogger("MosesTokenizer").setLevel(logging.WARNING)
        logging.getLogger("MosesSentenceSplitter").setLevel(logging.WARNING)
        logging.getLogger("MosesPunctuationNormalizer").setLevel(logging.WARNING)
            
    try:
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))
        metadata_yaml = yaml.safe_load(args.metadata)

        args.source_lang = metadata_yaml["source_lang"]
        args.target_lang = metadata_yaml["target_lang"]
        if "source_tokeniser_path" in metadata_yaml:
            args.source_tokeniser_path = metadata_yaml["source_tokeniser_path"]
        if "target_tokeniser_path" in metadata_yaml:
            args.target_tokeniser_path = metadata_yaml["target_tokeniser_path"]

        try:
            args.clf = joblib.load(os.path.join(yamlpath, metadata_yaml["classifier"]))
        except Exception:
            args.clf = joblib.load(metadata_yaml["classifier"])

#        args.clf.n_jobs = None
        args.classifier_type = metadata_yaml["classifier_type"]


        try:
            args.dict_sl_tl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except Exception:
            args.dict_sl_tl = ProbabilisticDictionary(metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except Exception:
            args.dict_tl_sl = ProbabilisticDictionary(metadata_yaml["target_dictionary"])

        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.good_examples = metadata_yaml["good_examples"]
        args.wrong_examples = metadata_yaml["wrong_examples"]
        args.good_test_examples = metadata_yaml["good_test_examples"]
        args.wrong_test_examples = metadata_yaml["wrong_test_examples"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if "features_version" not in metadata_yaml else int(metadata_yaml["features_version"])
        
        threshold = np.argmax(metadata_yaml["accuracy_histogram"])*0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold
        
        # Load LM stuff if the model was trained with it
        if "source_lm" in metadata_yaml and "target_lm" in metadata_yaml:
            lmFilter = DualLMFluencyFilter(LMType[metadata_yaml['lm_type']], args.source_lang, args.target_lang)
            stats = DualLMStats(metadata_yaml['clean_mean_perp'], metadata_yaml['clean_stddev_perp'], metadata_yaml['noisy_mean_perp'], metadata_yaml['noisy_stddev_perp'])

            fullpath_source_lm = os.path.join(yamlpath, metadata_yaml['source_lm'])
            if os.path.isfile(fullpath_source_lm):
                source_lm = fullpath_source_lm
            else:
                source_lm = metadata_yaml['source_lm']

            fullpath_target_lm = os.path.join(yamlpath, metadata_yaml['target_lm'])
            if os.path.isfile(fullpath_target_lm):
                target_lm = fullpath_target_lm
            else:
                target_lm = metadata_yaml['target_lm']
            lmFilter.load(source_lm, target_lm, stats)
            args.lm_filter = lmFilter
        else:
            args.lm_filter = None
        
        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)   
   
    except:
        print("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)
    
    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    if args.score_only and args.keep_lm_result:
        raise AssertionError("Conflicting arguments: cannot output bicleaner score only AND keep language model result")

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
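
The repeated try/except fallbacks above all encode the same lookup: prefer a path relative to the metadata file's directory, then fall back to the raw value. A hypothetical helper (not part of bicleaner) that factors this out, testing for the file instead of catching every exception so real loader errors are not masked:

import os

def load_relative_or_raw(yamlpath, name, loader):
    # Prefer the path relative to the metadata directory; otherwise
    # pass the value through unchanged (absolute or CWD-relative path)
    candidate = os.path.join(yamlpath, name)
    if os.path.isfile(candidate):
        return loader(candidate)
    return loader(name)

# Possible usage:
# args.clf = load_relative_or_raw(yamlpath, metadata_yaml["classifier"], joblib.load)
# args.dict_sl_tl = load_relative_or_raw(yamlpath, metadata_yaml["source_dictionary"], ProbabilisticDictionary)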
Example #3
def perform_training(args):
    global nline
    time_start = default_timer()
    logging.info("Starting process")

    # Read input to a named temporary file
    # We may need to read it multiple times and that would be problematic if it is sys.stdin
    input = NamedTemporaryFile(mode="w", delete=False)
    for line in args.input:
        input.write(line)
    input.close()

    stats = None
    with open(input.name) as input_f:
        args.input = input_f
        stats = train_fluency_filter(args)
        args.input.seek(0)

        # Shuffle and get length ratio
        total_size, length_ratio, good_sentences, wrong_sentences = shuffle(
            args.input, args.good_examples + args.good_test_examples,
            args.wrong_examples + args.wrong_test_examples,
            args.wrong_examples_file)
    os.remove(input.name)

    args.length_ratio = length_ratio

    # Load dictionaries
    args.dict_sl_tl = ProbabilisticDictionary(args.source_dictionary)
    args.dict_tl_sl = ProbabilisticDictionary(args.target_dictionary)

    features_file = NamedTemporaryFile(mode="w+", delete=False)  # text mode: it is read back line by line below
    if args.source_tokeniser_path:
        tokl = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        tokl = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        tokr = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        tokr = MosesTokenizer(args.target_lang)
    with open(good_sentences.name, 'r') as gsf, \
            open(wrong_sentences.name, 'r') as wsf, \
            open(features_file.name, 'w+') as fileout:

        for i in gsf:
            srcsen, trgsen = i.split("\t")[:2]
            #            print(str(i) + " ---" + str(srcsen) + " --- " + str(trgsen))
            features = feature_extract(srcsen, trgsen, tokl, tokr, args)
            for j in features:
                fileout.write("{}".format(j))
                fileout.write("\t")
            fileout.write("{}".format(1))
            fileout.write("\n")
        fileout.flush()

        for i in wsf:
            srcsen, trgsen = i.split("\t")[:2]
            #            print(str(i) + " ---" + str(srcsen) + " --- " + str(trgsen))
            features = feature_extract(srcsen, trgsen, tokl, tokr, args)
            for j in features:
                fileout.write("{}".format(j))
                fileout.write("\t")
            fileout.write("{}".format(0))
            fileout.write("\n")
        fileout.flush()
    tokl.close()
    tokr.close()

    features_file.seek(0)

    if args.dump_features:
        logging.info("Dumping features to " +
                     os.path.abspath(args.dump_features.name))
        for i in features_file:
            args.dump_features.write(i)
        args.dump_features.close()
        features_file.seek(0)

    logging.info("Start training")
    features_file.close()

    hgood = []
    hwrong = []
    with TemporaryFile("w+") as features_train, TemporaryFile(
            "w+") as features_test, open(features_file.name, 'r') as ff:
        nline = 0
        for line in ff:
            #            print(line)
            if nline < args.good_examples:
                features_train.write(line)
            elif nline < args.good_examples + args.good_test_examples:
                features_test.write(line)
            elif nline < args.good_examples + args.good_test_examples + args.wrong_examples:
                features_train.write(line)
            else:
                features_test.write(line)
            nline += 1

        features_train.flush()
        features_test.flush()

        features_train.seek(0)
        features_test.seek(0)
        hgood, hwrong = train_classifier(features_train, features_test,
                                         args.classifier_type, args.classifier)
        features_train.close()
        features_test.close()

    logging.info("End training")

    write_metadata(args, length_ratio, hgood, hwrong, stats)
    args.metadata.close()

    # Stats
    logging.info("Finished")
    elapsed_time = default_timer() - time_start
    logging.info("Elapsed time {:.2f} s".format(elapsed_time))
Example #4
def initialization():
    global logging_level

    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input',
                        type=argparse.FileType('rt'),
                        default=None,
                        help="Tab-separated files to be classified")
    parser.add_argument('output',
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="Output of the classification")
    parser.add_argument('metadata',
                        type=argparse.FileType('r'),
                        default=None,
                        help="Training metadata (YAML file)")

    ## Parameters required
    #groupM = parser.add_argument_group('Mandatory')

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S",
                        "--source_tokenizer_command",
                        type=str,
                        help="Source language (SL) tokenizer full command")
    groupO.add_argument("-T",
                        "--target_tokenizer_command",
                        type=str,
                        help="Target language (TL) tokenizer full command")

    groupO.add_argument("--scol",
                        default=3,
                        type=check_positive,
                        help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol",
                        default=4,
                        type=check_positive,
                        help="Target sentence column (starting in 1)")

    groupO.add_argument(
        '--tmp_dir',
        default=gettempdir(),
        help="Temporary directory where the temporary files of this program are created")
    groupO.add_argument('-b',
                        '--block_size',
                        type=int,
                        default=200,
                        help="Sentence pairs per block")
    groupO.add_argument('-p',
                        '--processes',
                        type=int,
                        default=max(1,
                                    cpu_count() - 1),
                        help="Number of processes to use")

    groupO.add_argument(
        '-d',
        '--discarded_tus',
        type=argparse.FileType('w'),
        default=None,
        help="TSV file with discarded TUs. TUs discarded by the classifier are written to this file in TSV format.")
    groupO.add_argument(
        '--lm_threshold',
        type=check_positive_between_zero_and_one,
        default=0.5,
        help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold will be removed (classifier score set to 0)")
    #groupO.add_argument('--keep_lm_result',action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.")

    groupO.add_argument(
        '--score_only',
        action='store_true',
        help="Only output one column which is the bicleaner score",
        default=False)
    groupO.add_argument(
        '--disable_hardrules',
        action='store_true',
        help="Disables the bicleaner_hardrules filtering (only bicleaner_classify is applied)")
    groupO.add_argument('--disable_lm_filter',
                        action='store_true',
                        help="Disables LM filtering")
    groupO.add_argument('--disable_porn_removal',
                        default=False,
                        action='store_true',
                        help="Don't apply p**n removal")
    groupO.add_argument('--disable_minimal_length',
                        default=False,
                        action='store_true',
                        help="Don't apply minimal length rule")

    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Silent logging mode')
    groupL.add_argument('--debug',
                        action='store_true',
                        help='Debug logging mode')
    groupL.add_argument('--logfile',
                        type=argparse.FileType('a'),
                        default=sys.stderr,
                        help="Store log to a file")
    groupL.add_argument('-v',
                        '--version',
                        action='version',
                        version="%(prog)s " + __version__,
                        help="show version of this script and exit")

    # Validating & parsing
    # Checking if metadata is specified
    args = parser.parse_args()
    logging_setup(args)

    logging_level = logging.getLogger().level

    try:
        metadata_yaml = yaml.safe_load(args.metadata)
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))
        metadata_yaml["yamlpath"] = yamlpath

        args.source_lang = metadata_yaml["source_lang"]
        args.target_lang = metadata_yaml["target_lang"]
        if "source_tokenizer_command" in metadata_yaml:
            args.source_tokenizer_command = metadata_yaml[
                "source_tokenizer_command"]
        if "target_tokenizer_command" in metadata_yaml:
            args.target_tokenizer_command = metadata_yaml[
                "target_tokenizer_command"]

        try:
            args.clf = joblib.load(
                os.path.join(yamlpath, metadata_yaml["classifier"]))
        except Exception:
            args.clf = joblib.load(metadata_yaml["classifier"])

#        args.clf.n_jobs = None
        args.classifier_type = metadata_yaml["classifier_type"]

        try:
            args.dict_sl_tl = ProbabilisticDictionary(
                os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except Exception:
            args.dict_sl_tl = ProbabilisticDictionary(
                metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(
                os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except Exception:
            args.dict_tl_sl = ProbabilisticDictionary(
                metadata_yaml["target_dictionary"])

        try:
            args.sl_word_freqs = WordZipfFreqDist(
                os.path.join(yamlpath, metadata_yaml["source_word_freqs"]))
        except Exception:
            try:
                args.sl_word_freqs = WordZipfFreqDist(
                    metadata_yaml["source_word_freqs"])
            except Exception:
                args.sl_word_freqs = None
        try:
            args.tl_word_freqs = WordZipfFreqDist(
                os.path.join(yamlpath, metadata_yaml["target_word_freqs"]))
        except Exception:
            try:
                args.tl_word_freqs = WordZipfFreqDist(
                    metadata_yaml["target_word_freqs"])
            except Exception:
                args.tl_word_freqs = None

        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if "features_version" not in metadata_yaml else int(
            metadata_yaml["features_version"])

        threshold = np.argmax(metadata_yaml["accuracy_histogram"]) * 0.1
        logging.info("Accuracy histogram: {}".format(
            metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold

        #Try loading metadata for LM filtering
        if not args.disable_lm_filter:
            if not ("source_lm" in metadata_yaml
                    and "target_lm" in metadata_yaml):
                args.disable_lm_filter = True
                logging.warning(
                    "LM filter not present in metadata, disabling.")
        else:
            logging.info("LM filtering disabled")

        if not args.disable_porn_removal:
            if not ("porn_removal_file" in metadata_yaml
                    and "porn_removal_side" in metadata_yaml):
                args.disable_porn_removal = True
                logging.warning(
                    "P**n removal not present in metadata, disabling.")
            else:
                try:
                    args.porn_removal = fasttext.load_model(
                        os.path.join(yamlpath,
                                     metadata_yaml['porn_removal_file']))
                except Exception:
                    args.porn_removal = fasttext.load_model(
                        metadata_yaml['porn_removal_file'])
        else:
            logging.info("P**n removal disabled")

        if "disable_lang_ident" in metadata_yaml:
            args.disable_lang_ident = metadata_yaml["disable_lang_ident"]
        else:
            args.disable_lang_ident = False

        logging.debug("YAML")
        logging.debug(metadata_yaml)
        args.metadata_yaml = metadata_yaml
        parser.set_defaults(**metadata_yaml)

    except Exception:
        logging.error("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)

    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
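
The initialization above guards each optional component (LM filter, p**n removal) with the same pattern: keep it enabled only if the metadata provides the required keys, otherwise disable it with a warning. A generic sketch of that guard; the helper name is hypothetical and only the key names follow the example:

import logging

def should_disable(user_disabled, metadata, required_keys, name):
    # Respect an explicit user opt-out first
    if user_disabled:
        logging.info("%s disabled", name)
        return True
    # Otherwise disable only when the metadata lacks the needed files
    if not all(key in metadata for key in required_keys):
        logging.warning("%s not present in metadata, disabling.", name)
        return True
    return False

# Possible usage:
# args.disable_lm_filter = should_disable(args.disable_lm_filter, metadata_yaml,
#                                         ("source_lm", "target_lm"), "LM filter")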
Example #5
def perform_training(args):
    time_start = default_timer()
    logging.info("Starting process")
    logging.info("Running {0} workers at {1} rows per block".format(
        args.processes, args.block_size))

    process_count = max(1, args.processes)
    maxsize = 1000 * process_count

    output_queue = Queue(maxsize=maxsize)
    worker_count = process_count

    #Read input to a named temporary file
    #We may need to read it multiple times and that would be problematic if it is sys.stdin
    input = NamedTemporaryFile(mode="w", delete=False)
    for line in args.input:
        input.write(line)
    input.close()

    stats = None
    with open(input.name) as input_f:
        args.input = input_f
        stats = train_fluency_filter(args)
        input_f.seek(0)

        # Shuffle and get length ratio
        total_size, length_ratio, good_sentences, wrong_sentences = shuffle(
            args.input, args.good_examples + args.good_test_examples,
            args.wrong_examples + args.wrong_test_examples,
            args.wrong_examples_file)
    os.remove(input.name)

    args.length_ratio = length_ratio

    # Load dictionaries
    args.dict_sl_tl = ProbabilisticDictionary(args.source_dictionary)
    args.dict_tl_sl = ProbabilisticDictionary(args.target_dictionary)

    features_file = TemporaryFile('w+')
    # Start reducer
    reduce = Process(target=reduce_process, args=(output_queue, features_file))
    reduce.start()

    # Start workers
    jobs_queue = Queue(maxsize=maxsize)
    workers = []
    for i in range(worker_count):
        worker = Process(target=worker_process,
                         args=(i, jobs_queue, output_queue, args))
        worker.daemon = True  # dies with the parent process
        worker.start()
        workers.append(worker)

    # Mapper process (foreground - parent)
    last_block = map_process(good_sentences, args.block_size, jobs_queue, 1, 0)
    good_sentences.close()

    map_process(wrong_sentences, args.block_size, jobs_queue, 0,
                last_block + 1)
    wrong_sentences.close()

    # Worker termination
    for _ in workers:
        jobs_queue.put(None)

    logging.info("End mapping")

    for w in workers:
        w.join()

    # Reducer termination
    output_queue.put(None)
    reduce.join()

    features_file.seek(0)

    if args.dump_features:
        logging.info("Dumping features to " +
                     os.path.abspath(args.dump_features.name))
        for i in features_file:
            args.dump_features.write(i)
        args.dump_features.close()
        features_file.seek(0)

    logging.info("Start training")

    hgood = []
    hwrong = []
    with TemporaryFile("w+") as features_train, TemporaryFile(
            "w+") as features_test:
        nline = 0
        for line in features_file:
            if nline < args.good_examples:
                features_train.write(line)
            elif nline < args.good_examples + args.good_test_examples:
                features_test.write(line)
            elif nline < args.good_examples + args.good_test_examples + args.wrong_examples:
                features_train.write(line)
            else:
                features_test.write(line)
            nline += 1

        features_train.flush()
        features_test.flush()

        features_train.seek(0)
        features_test.seek(0)
        hgood, hwrong = train_classifier(features_train, features_test,
                                         args.classifier_type, args.classifier)
        features_train.close()
        features_test.close()

    logging.info("End training")

    write_metadata(args, length_ratio, hgood, hwrong, stats)
    args.metadata.close()

    # Stats
    logging.info("Finished")
    elapsed_time = default_timer() - time_start
    logging.info("Elapsed time {:.2f} s".format(elapsed_time))
Example #6
def initialization():
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None, help="Tab-separated files to be classified")      
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output of the classification")
    parser.add_argument('metadata', type=argparse.FileType('r'), default=None, help="Training metadata (YAML file)")    

    ## Parameters required
    #groupM = parser.add_argument_group('Mandatory')


    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S", "--source_tokeniser_path", type=str, help="Source language (SL) tokeniser executable absolute path")
    groupO.add_argument("-T", "--target_tokeniser_path", type=str, help="Target language (TL) tokeniser executable absolute path")
    
    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where the temporary files of this program are created")
    groupO.add_argument('-b', '--block_size', type=int, default=200, help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count()-1), help="Number of processes to use")

    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file with discarded TUs. TUs discarded by the classifier are written to this file in TSV format.")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for classifier. If an accuracy histogram is present in the metadata, the interval for its max value will be used as the default instead of the current default.")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold will be removed (classifier score set to 0), unless the option --keep_lm_result is set.")
    groupO.add_argument('--keep_lm_result', action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.")
    
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

    # Validating & parsing
    # Checking if metadata is specified
    args = parser.parse_args()
    logging_setup(args)


    
    try:
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))
        metadata_yaml = yaml.safe_load(args.metadata)

        args.source_lang = metadata_yaml["source_lang"]
        args.target_lang = metadata_yaml["target_lang"]
        if "source_tokeniser_path" in metadata_yaml:
            args.source_tokeniser_path = metadata_yaml["source_tokeniser_path"]
        if "target_tokeniser_path" in metadata_yaml:
            args.target_tokeniser_path = metadata_yaml["target_tokeniser_path"]

        try:
            args.clf = joblib.load(os.path.join(yamlpath, metadata_yaml["classifier"]))
        except Exception:
            args.clf = joblib.load(metadata_yaml["classifier"])

#        args.clf.n_jobs = None
        args.classifier_type = metadata_yaml["classifier_type"]


        try:
            args.dict_sl_tl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except Exception:
            args.dict_sl_tl = ProbabilisticDictionary(metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except Exception:
            args.dict_tl_sl = ProbabilisticDictionary(metadata_yaml["target_dictionary"])

        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.good_examples = metadata_yaml["good_examples"]
        args.wrong_examples = metadata_yaml["wrong_examples"]
        args.good_test_examples = metadata_yaml["good_test_examples"]
        args.wrong_test_examples = metadata_yaml["wrong_test_examples"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if "features_version" not in metadata_yaml else int(metadata_yaml["features_version"])
        
        threshold = np.argmax(metadata_yaml["accuracy_histogram"])*0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold
        
        # Load LM stuff if the model was trained with it
        if "source_lm" in metadata_yaml and "target_lm" in metadata_yaml:
            fullpath_source_lm = os.path.join(yamlpath, metadata_yaml['source_lm'])
            if os.path.isfile(fullpath_source_lm):
                args.source_lm = fullpath_source_lm
            else:
                args.source_lm = metadata_yaml['source_lm']

            fullpath_target_lm = os.path.join(yamlpath, metadata_yaml['target_lm'])
            if os.path.isfile(fullpath_target_lm):
                args.target_lm = fullpath_target_lm
            else:
                args.target_lm = metadata_yaml['target_lm']

            args.lm_type = LMType[metadata_yaml['lm_type']]
            stats = DualLMStats(metadata_yaml['clean_mean_perp'], metadata_yaml['clean_stddev_perp'], metadata_yaml['noisy_mean_perp'], metadata_yaml['noisy_stddev_perp'])
            args.lm_filter_stats = stats
        else:
            args.source_lm = None
            args.target_lm = None
            args.lm_type = None
            args.lm_filter_stats = None

        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)   
   
    except:
        print("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)
    
    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
Example #7
def perform_training(args):
    time_start = default_timer()
    logging.debug("Starting process")
    logging.debug("Running {0} workers at {1} rows per block".format(
        args.processes, args.block_size))

    process_count = max(1, args.processes)
    maxsize = 1000 * process_count

    output_queue = Queue(maxsize=maxsize)
    worker_count = process_count

    #Read input to a named temporary file
    #We may need to read it multiple times and that would be problematic if it is sys.stdin

    count_input_lines = 0
    input = NamedTemporaryFile(mode="w", delete=False)
    for line in args.input:
        input.write(line)
        count_input_lines = count_input_lines + 1
    input.close()

    if count_input_lines < 10000:
        logging.error(
            "Training corpus must be at least 10K sentences long (was {}).".
            format(count_input_lines))
        sys.exit(1)

    # Load dictionaries
    if args.source_word_freqs:
        args.sl_word_freqs = WordZipfFreqDist(args.source_word_freqs)
    if args.target_word_freqs:
        args.tl_word_freqs = WordZipfFreqDistDoubleLinked(
            args.target_word_freqs)
    else:
        args.tl_word_freqs = None

    # Train p**n removal classifier
    train_porn_removal(args)

    stats = None
    with open(input.name) as input_f:
        args.input = input_f
        stats = train_fluency_filter(args)
        input_f.seek(0)

        # Shuffle and get length ratio
        noisy_target_tokenizer = Tokenizer(args.target_tokenizer_command,
                                           args.target_lang)
        total_size, length_ratio, good_sentences, wrong_sentences = build_noisy_set(
            args.input, count_input_lines // 2, count_input_lines // 2,
            args.wrong_examples_file, args.tl_word_freqs,
            noisy_target_tokenizer)
        noisy_target_tokenizer.close()
    os.remove(input.name)

    args.length_ratio = length_ratio

    # Load dictionaries
    args.dict_sl_tl = ProbabilisticDictionary(args.source_dictionary)
    args.dict_tl_sl = ProbabilisticDictionary(args.target_dictionary)

    logging.info("Start computing features.")
    features_file = TemporaryFile('w+')
    # Start reducer
    reduce = Process(target=reduce_process, args=(output_queue, features_file))
    reduce.start()

    # Start workers
    jobs_queue = Queue(maxsize=maxsize)
    workers = []
    for i in range(worker_count):
        worker = Process(target=worker_process,
                         args=(i, jobs_queue, output_queue, args))
        worker.daemon = True  # dies with the parent process
        worker.start()
        workers.append(worker)

    # Mapper process (foreground - parent)
    last_block = map_process(good_sentences, args.block_size, jobs_queue, 1, 0)
    good_sentences.close()

    map_process(wrong_sentences, args.block_size, jobs_queue, 0,
                last_block + 1)
    wrong_sentences.close()

    # Worker termination
    for _ in workers:
        jobs_queue.put(None)

    logging.info("End computing features.")

    for w in workers:
        w.join()

    # Reducer termination
    output_queue.put(None)
    reduce.join()

    features_file.seek(0)

    if args.dump_features:
        logging.info("Dumping features to " +
                     os.path.abspath(args.dump_features.name))
        for i in features_file:
            args.dump_features.write(i)
        args.dump_features.close()
        features_file.seek(0)

    logging.info("Start training.")

    # Use 90% of the input to train and 10% for test
    if args.wrong_examples_file is not None:
        good_examples = int(count_input_lines * 0.9)
        good_examples_test = int(count_input_lines * 0.1)
        wrong_examples = 0
        with args.wrong_examples_file as file:
            wrong_examples = sum(1 for line in file)
        wrong_examples_test = min(good_examples_test,
                                  int(wrong_examples * 0.1))
    else:
        good_examples = int(count_input_lines // 2 * 0.9)
        good_examples_test = int(count_input_lines // 2 * 0.1)
        wrong_examples = good_examples
        wrong_examples_test = good_examples_test

    hgood = []
    hwrong = []
    with TemporaryFile("w+") as features_train, TemporaryFile(
            "w+") as features_test:
        nline = 0
        for line in features_file:
            if nline < good_examples:
                features_train.write(line)
            elif nline < good_examples + good_examples_test:
                features_test.write(line)
            elif nline < good_examples + good_examples_test + wrong_examples:
                features_train.write(line)
            else:
                features_test.write(line)
            nline += 1

        features_train.flush()
        features_test.flush()

        features_train.seek(0)
        features_test.seek(0)
        hgood, hwrong = train_classifier(
            features_train, features_test, args.classifier_type,
            args.classifier,
            Features(None, args.disable_features_quest,
                     args.disable_lang_ident).titles)
        features_train.close()
        features_test.close()

    logging.info("End training.")

    write_metadata(args, length_ratio, hgood, hwrong, stats)
    args.metadata.close()

    # Stats
    logging.info("Finished.")
    elapsed_time = default_timer() - time_start
    logging.info("Elapsed time {:.2f}s.".format(elapsed_time))