Example 1
def initialization():
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="Previously classified TSV from which to extract bad examples")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output with the bad examples selected in the process")

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument('--tmp_dir', type=check_if_folder, default=gettempdir(), help="Temporary directory where this program's temporary files are created")
    groupO.add_argument('-w', '--wrong_examples', type=check_positive_or_zero, default=50000, help="Number of wrong examples")
    groupO.add_argument('--wrong_test_examples', type=check_positive_or_zero, default=2000, help="Number of wrong test examples")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=None, help="Threshold for classifier.")

    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

    # Validating & parsing
    args = parser.parse_args()
    logging_setup(args)

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
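Several of these examples pass custom callables (check_positive, check_positive_or_zero, check_positive_between_zero_and_one, check_if_folder) as argparse type= converters without ever showing them. A minimal sketch of what such validators plausibly look like; the originals in the source project may differ:

import argparse
import os

def check_positive(value):
    # argparse passes the raw string; raise ArgumentTypeError to reject it
    ivalue = int(value)
    if ivalue <= 0:
        raise argparse.ArgumentTypeError("{} is not a positive integer".format(value))
    return ivalue

def check_positive_or_zero(value):
    ivalue = int(value)
    if ivalue < 0:
        raise argparse.ArgumentTypeError("{} is not a positive integer or zero".format(value))
    return ivalue

def check_positive_between_zero_and_one(value):
    fvalue = float(value)
    if not 0.0 <= fvalue <= 1.0:
        raise argparse.ArgumentTypeError("{} is not between 0 and 1".format(value))
    return fvalue

def check_if_folder(path):
    # Used for --tmp_dir: the directory must already exist
    if not os.path.isdir(path):
        raise argparse.ArgumentTypeError("{} is not an existing folder".format(path))
    return path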
Example 2
def main(config_file: str = ConfigOption, version: bool = VersionOption):
    """
    This is the entry point of your command line application. The values of the CLI params that
    are passed to this application will show up as parameters to this function.

    This docstring is where you describe what your command line application does.
    Try running `python -m {{ cookiecutter.module_name }} --help` to see how this shows up in the command line.
    """
    {% if cookiecutter.config_file != 'none' %}config = util.load_config(config_file)
    util.logging_setup(config){% endif %}
    logger.info("Looks like you're all set up. Let's get going!")
Example 3
def initialization():
    global logging_level

    # Validating & parsing arguments
    parser, groupO, _ = argument_parser()
    args = parser.parse_args()

    # Set up logging
    logging_setup(args)
    logging_level = logging.getLogger().level
    import tensorflow as tf

    # Set number of processes to be used by TensorFlow
    tf.config.threading.set_intra_op_parallelism_threads(args.processes)
    tf.config.threading.set_inter_op_parallelism_threads(args.processes)

    # Load metadata YAML
    args = load_metadata(args, parser)

    return args
Example 4
def initialization():
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)

    parser.add_argument('input',  nargs='?', type=argparse.FileType('r'), default=sys.stdin,  help="Tab-separated bilingual input file")

    groupM = parser.add_argument_group("Mandatory")
    groupM.add_argument('-m', '--metadata', type=argparse.FileType('w'), required=True, help="Training metadata (YAML file)")
    groupM.add_argument('-c', '--classifier', type=argparse.FileType('wb'), required=True, help="Classifier data file")
    groupM.add_argument('-s', '--source_lang',  required=True, help="Source language code")
    groupM.add_argument('-t', '--target_lang', required=True, help="Target language code")
    groupM.add_argument('-d', '--source_dictionary',  type=argparse.FileType('r'), required=True, help="LR gzipped probabilistic dictionary")
    groupM.add_argument('-D', '--target_dictionary', type=argparse.FileType('r'), required=True, help="RL gzipped probabilistic dictionary")

    groupO = parser.add_argument_group('Options')
    groupO.add_argument('--normalize_by_length', action='store_true', help="Normalize by length in qmax dict feature")
    groupO.add_argument('--treat_oovs', action='store_true', help="Special treatment for OOVs in qmax dict feature")
    groupO.add_argument('--qmax_limit', type=check_positive_or_zero, default=20, help="Number of max target words to be taken into account, sorted by length")
    groupO.add_argument('--disable_features_quest', action='store_false', help="Disable less important features")
    groupO.add_argument('-g', '--good_examples',  type=check_positive_or_zero, default=50000, help="Number of good examples")
    groupO.add_argument('-w', '--wrong_examples', type=check_positive_or_zero, default=50000, help="Number of wrong examples")
    groupO.add_argument('--good_test_examples',  type=check_positive_or_zero, default=2000, help="Number of good test examples")
    groupO.add_argument('--wrong_test_examples', type=check_positive_or_zero, default=2000, help="Number of wrong test examples")
    groupO.add_argument('--classifier_type', choices=['svm', 'nn', 'nn1', 'adaboost', 'random_forest'], default="svm", help="Classifier type")
    groupO.add_argument('--dump_features', type=argparse.FileType('w'), default=None, help="Dump training features to file")
    groupO.add_argument('-b', '--block_size', type=check_positive, default=10000, help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=check_positive, default=max(1, cpu_count()-1), help="Number of processes to use")
    groupO.add_argument('--wrong_examples_file', type=argparse.FileType('r'), default=None, help="File with wrong examples, used instead of the synthetic examples generated by the default method")

    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")

    args = parser.parse_args()
    # Logging
    logging_setup(args)

    return args
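Every example calls a logging_setup(args) helper that this listing never shows. A plausible sketch that consumes only the --quiet/--debug/--logfile flags defined above (the project's actual helper may differ):

import logging
import sys

def logging_setup(args=None):
    # Configure the root logger from the parsed CLI flags; sketch only.
    logger = logging.getLogger()
    logger.handlers = []  # reset handlers from a previous call
    level = logging.INFO
    stream = sys.stderr
    if args is not None:
        if args.quiet:
            level = logging.ERROR
        if args.debug:
            level = logging.DEBUG
        stream = args.logfile  # a file object (FileType('a')), stderr by default
    handler = logging.StreamHandler(stream)
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    logger.setLevel(level)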
Example 5
def initialization():
  logging.info("Processing arguments...")
  # Getting arguments and options with argparse
  # Initialization of the argparse class
  parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)

  # Mandatory parameters
  parser.add_argument('input', type=argparse.FileType('r'), default=None, help="File to be anonymized")
  parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="File with anonymization annotations")
  parser.add_argument("srclang", type=str, help="Source language (SL) of the input")
  parser.add_argument("trglang", type=str, help="Target language (TL) of the input")
  
  ## Parameters required
  groupM = parser.add_argument_group('Mandatory')
  groupM.add_argument("--format", choices=["tmx", "cols"], required=True, type=str, help="Input file format. Values: cols, tmx")
  
  # Options group
  groupO = parser.add_argument_group('Optional')
  groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where this program's temporary files are created")
  groupM.add_argument("--core", default=0, type=int, help="GPU id")

  # Logging group
  groupL = parser.add_argument_group('Logging')
  groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
  groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
  groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
  groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

  # Validating & parsing
  args = parser.parse_args()
  util.logging_setup(args)
  logging.debug("Arguments processed: {}".format(str(args)))
  if args.format=="tmx" and args.input.name=="<stdin>":
    logging.error("Cannot process TMX from standard input.")
    sys.exit(1)
  logging.info("Arguments processed.")
  return args
Example 6
def initialization():
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None, help="Tab-separated files to be classified")
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output of the classification")

    ## Parameters required
    groupM = parser.add_argument_group('Mandatory')
    groupM.add_argument('-m', '--metadata', type=argparse.FileType('r'), required=True, help="Training metadata (YAML file). Note that explicit command line arguments will overwrite values from the metadata file")

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-s", "--source_lang", type=str, help="Source language (SL) of the input")
    groupO.add_argument("-t", "--target_lang", type=str, help="Target language (TL) of the input")
    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where this program's temporary files are created")
    groupO.add_argument('-b', '--block_size', type=int, default=10000, help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count()-1), help="Number of processes to use")
    groupO.add_argument('--normalize_by_length', action='store_true', help="Normalize by length in qmax dict feature")
    groupO.add_argument('--treat_oovs', action='store_true', help="Special treatment for OOVs in qmax dict feature")
    groupO.add_argument('--qmax_limit', type=check_positive_or_zero, default=20, help="Number of max target words to be taken into account, sorted by length")    
    groupO.add_argument('--disable_features_quest', action='store_false', help="Disable less important features")
    groupO.add_argument('-g', '--good_examples',  type=check_positive_or_zero, default=50000, help="Number of good examples")
    groupO.add_argument('-w', '--wrong_examples', type=check_positive_or_zero, default=50000, help="Number of wrong examples")
    groupO.add_argument('--good_test_examples',  type=check_positive_or_zero, default=2000, help="Number of good test examples")
    groupO.add_argument('--wrong_test_examples', type=check_positive_or_zero, default=2000, help="Number of wrong test examples")
    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file with discarded TUs. TUs discarded by the classifier are written to this file.")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for classifier. If an accuracy histogram is present in the metadata, the interval for its maximum value will be used as the default instead.")
    
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

    # Validating & parsing
    # Checking if metadata is specified
    preliminary_args = parser.parse_args()
    if preliminary_args.metadata is not None:
        # If so, we load values from metadata
        metadata_yaml = yaml.safe_load(preliminary_args.metadata)
        threshold = np.argmax(metadata_yaml["accuracy_histogram"])*0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold
        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)
    # Then we parse again so that options given explicitly on the command line overwrite the metadata values
    args = parser.parse_args()
    logging_setup(args)
    
    # Extra-checks for args here
    # Load dictionaries
    args.dict_sl_tl = ProbabilisticDictionary(args.source_dictionary)
    args.dict_tl_sl = ProbabilisticDictionary(args.target_dictionary)
    # Load classifier
    args.clf = joblib.load(args.classifier)

    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
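The two-pass parse above is the key trick: parse once to find the metadata file, inject its values with parser.set_defaults(**metadata_yaml), then parse again so anything given explicitly on the command line still wins. A self-contained toy demonstrating the behavior (values made up):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--threshold', type=float, default=0.5)

# Pretend these values were read from the metadata YAML file
parser.set_defaults(**{'threshold': 0.7})

print(parser.parse_args([]).threshold)                      # 0.7, from metadata
print(parser.parse_args(['--threshold', '0.9']).threshold)  # 0.9, CLI overrides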
Example 7
def initialization():
    global logging_level

    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input',
                        type=argparse.FileType('rt'),
                        default=None,
                        help="Tab-separated files to be classified")
    parser.add_argument('output',
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="Output of the classification")
    parser.add_argument('metadata',
                        type=argparse.FileType('r'),
                        default=None,
                        help="Training metadata (YAML file)")

    ## Parameters required
    #groupM = parser.add_argument_group('Mandatory')

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S",
                        "--source_tokenizer_command",
                        type=str,
                        help="Source language (SL) tokenizer full command")
    groupO.add_argument("-T",
                        "--target_tokenizer_command",
                        type=str,
                        help="Target language (TL) tokenizer full command")

    groupO.add_argument("--scol",
                        default=3,
                        type=check_positive,
                        help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol",
                        default=4,
                        type=check_positive,
                        help="Target sentence column (starting in 1)")

    groupO.add_argument(
        '--tmp_dir',
        default=gettempdir(),
        help="Temporary directory where this program's temporary files are created"
    )
    groupO.add_argument('-b',
                        '--block_size',
                        type=int,
                        default=200,
                        help="Sentence pairs per block")
    groupO.add_argument('-p',
                        '--processes',
                        type=int,
                        default=max(1,
                                    cpu_count() - 1),
                        help="Number of processes to use")

    groupO.add_argument(
        '-d',
        '--discarded_tus',
        type=argparse.FileType('w'),
        default=None,
        help="TSV file with discarded TUs. TUs discarded by the classifier are written to this file."
    )
    groupO.add_argument(
        '--lm_threshold',
        type=check_positive_between_zero_and_one,
        default=0.5,
        help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold are removed (classifier score set to 0)"
    )
    #groupO.add_argument('--keep_lm_result',action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.")

    groupO.add_argument(
        '--score_only',
        action='store_true',
        help="Only output one column which is the bicleaner score",
        default=False)
    groupO.add_argument(
        '--disable_hardrules',
        action='store_true',
        help=
        "Disables the bicleaner_hardrules filtering (only bicleaner_classify is applied)"
    )
    groupO.add_argument('--disable_lm_filter',
                        action='store_true',
                        help="Disables LM filtering")
    groupO.add_argument('--disable_porn_removal',
                        default=False,
                        action='store_true',
                        help="Don't apply p**n removal")
    groupO.add_argument('--disable_minimal_length',
                        default=False,
                        action='store_true',
                        help="Don't apply minimal length rule")

    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Silent logging mode')
    groupL.add_argument('--debug',
                        action='store_true',
                        help='Debug logging mode')
    groupL.add_argument('--logfile',
                        type=argparse.FileType('a'),
                        default=sys.stderr,
                        help="Store log to a file")
    groupL.add_argument('-v',
                        '--version',
                        action='version',
                        version="%(prog)s " + __version__,
                        help="show version of this script and exit")

    # Validating & parsing
    # Checking if metadata is specified
    args = parser.parse_args()
    logging_setup(args)

    logging_level = logging.getLogger().level

    try:
        metadata_yaml = yaml.safe_load(args.metadata)
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))
        metadata_yaml["yamlpath"] = yamlpath

        args.source_lang = metadata_yaml["source_lang"]
        args.target_lang = metadata_yaml["target_lang"]
        if "source_tokenizer_command" in metadata_yaml:
            args.source_tokenizer_command = metadata_yaml[
                "source_tokenizer_command"]
        if "target_tokenizer_command" in metadata_yaml:
            args.target_tokenizer_command = metadata_yaml[
                "target_tokenizer_command"]

        try:
            args.clf = joblib.load(
                os.path.join(yamlpath, metadata_yaml["classifier"]))
        except Exception:
            args.clf = joblib.load(metadata_yaml["classifier"])

#        args.clf.n_jobs = None
        args.classifier_type = metadata_yaml["classifier_type"]

        try:
            args.dict_sl_tl = ProbabilisticDictionary(
                os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except Exception:
            args.dict_sl_tl = ProbabilisticDictionary(
                metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(
                os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except Exception:
            args.dict_tl_sl = ProbabilisticDictionary(
                metadata_yaml["target_dictionary"])

        try:
            args.sl_word_freqs = WordZipfFreqDist(
                os.path.join(yamlpath, metadata_yaml["source_word_freqs"]))
        except Exception:
            try:
                args.sl_word_freqs = WordZipfFreqDist(
                    metadata_yaml["source_word_freqs"])
            except Exception:
                args.sl_word_freqs = None
        try:
            args.tl_word_freqs = WordZipfFreqDist(
                os.path.join(yamlpath, metadata_yaml["target_word_freqs"]))
        except Exception:
            try:
                args.tl_word_freqs = WordZipfFreqDist(
                    metadata_yaml["target_word_freqs"])
            except Exception:
                args.tl_word_freqs = None

        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if "features_version" not in metadata_yaml else int(
            metadata_yaml["features_version"])

        threshold = np.argmax(metadata_yaml["accuracy_histogram"]) * 0.1
        logging.info("Accuracy histogram: {}".format(
            metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold

        # Try loading metadata for LM filtering
        if not args.disable_lm_filter:
            if not ("source_lm" in metadata_yaml
                    and "target_lm" in metadata_yaml):
                args.disable_lm_filter = True
                logging.warning(
                    "LM filter not present in metadata, disabling.")
        else:
            logging.info("LM filtering disabled")

        if not args.disable_porn_removal:
            if not ("porn_removal_file" in metadata_yaml
                    and "porn_removal_side" in metadata_yaml):
                args.disable_porn_removal = True
                logging.warning(
                    "Porn removal not present in metadata, disabling.")
            else:
                try:
                    args.porn_removal = fasttext.load_model(
                        os.path.join(yamlpath,
                                     metadata_yaml['porn_removal_file']))
                except Exception:
                    args.porn_removal = fasttext.load_model(
                        metadata_yaml['porn_removal_file'])
        else:
            logging.info("Porn removal disabled")

        if "disable_lang_ident" in metadata_yaml:
            args.disable_lang_ident = metadata_yaml["disable_lang_ident"]
        else:
            args.disable_lang_ident = False

        logging.debug("YAML")
        logging.debug(metadata_yaml)
        args.metadata_yaml = metadata_yaml
        parser.set_defaults(**metadata_yaml)

    except Exception:
        logging.error("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)

    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
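This example resolves every resource twice: first relative to the YAML file's directory, then as the raw path from the metadata. A small hypothetical helper (not in the original code) that factors out that try/except ladder:

import os

def load_relative_or_plain(loader, yamlpath, filename):
    # Prefer the path relative to the metadata file, fall back to the raw value
    candidate = os.path.join(yamlpath, filename)
    if os.path.isfile(candidate):
        return loader(candidate)
    return loader(filename)

# Usage, mirroring the blocks above:
#   args.clf = load_relative_or_plain(joblib.load, yamlpath, metadata_yaml["classifier"])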
Example 8
def initialization():
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)

    parser.add_argument('input',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="Tab-separated bilingual input file")

    groupM = parser.add_argument_group("Mandatory")
    groupM.add_argument('-m',
                        '--metadata',
                        type=argparse.FileType('w'),
                        required=True,
                        help="Training metadata (YAML file)")
    groupM.add_argument('-c',
                        '--classifier',
                        type=argparse.FileType('wb'),
                        required=True,
                        help="Classifier data file")
    groupM.add_argument('-s',
                        '--source_lang',
                        required=True,
                        help="Source language code")
    groupM.add_argument('-t',
                        '--target_lang',
                        required=True,
                        help="Target language code")
    groupM.add_argument('-d',
                        '--source_dictionary',
                        type=argparse.FileType('r'),
                        required=True,
                        help="LR gzipped probabilistic dictionary")
    groupM.add_argument('-D',
                        '--target_dictionary',
                        type=argparse.FileType('r'),
                        required=True,
                        help="RL gzipped probabilistic dictionary")

    groupO = parser.add_argument_group('Options')
    groupO.add_argument('-S',
                        '--source_tokeniser_path',
                        help="Source language tokeniser absolute path")
    groupO.add_argument('-T',
                        '--target_tokeniser_path',
                        help="Target language tokeniser absolute path")
    groupO.add_argument('--normalize_by_length',
                        action='store_true',
                        help="Normalize by length in qmax dict feature")
    groupO.add_argument('--treat_oovs',
                        action='store_true',
                        help="Special treatment for OOVs in qmax dict feature")
    groupO.add_argument(
        '--qmax_limit',
        type=check_positive_or_zero,
        default=20,
        help=
        "Number of max target words to be taken into account, sorted by length"
    )
    groupO.add_argument('--disable_features_quest',
                        action='store_false',
                        help="Disable less important features")
    groupO.add_argument('-g',
                        '--good_examples',
                        type=check_positive_or_zero,
                        default=50000,
                        help="Number of good examples")
    groupO.add_argument('-w',
                        '--wrong_examples',
                        type=check_positive_or_zero,
                        default=50000,
                        help="Number of wrong examples")
    groupO.add_argument('--good_test_examples',
                        type=check_positive_or_zero,
                        default=10000,
                        help="Number of good test examples")
    groupO.add_argument('--wrong_test_examples',
                        type=check_positive_or_zero,
                        default=10000,
                        help="Number of wrong test examples")
    groupO.add_argument(
        '--classifier_type',
        choices=['svm', 'nn', 'nn1', 'adaboost', 'random_forest'],
        default="random_forest",
        help="Classifier type")
    groupO.add_argument('--dump_features',
                        type=argparse.FileType('w'),
                        default=None,
                        help="Dump training features to file")
    groupO.add_argument(
        '--wrong_examples_file',
        type=argparse.FileType('r'),
        default=None,
        help="File with wrong examples, used instead of the synthetic examples generated by the default method"
    )
    groupO.add_argument('--features_version',
                        type=check_positive,
                        default=FEATURES_VERSION,
                        help="Version of the features")

    # For LM filtering
    groupO.add_argument(
        '--noisy_examples_file_sl',
        type=str,
        help=
        "File with noisy text in the SL. These are used to estimate the perplexity of noisy text."
    )
    groupO.add_argument(
        '--noisy_examples_file_tl',
        type=str,
        help=
        "File with noisy text in the TL. These are used to estimate the perplexity of noisy text."
    )
    groupO.add_argument(
        '--lm_dev_size',
        type=check_positive_or_zero,
        default=2000,
        help=
        "Number of sentences to be removed from clean text before training LMs. These are used to estimate the perplexity of clean text."
    )
    groupO.add_argument('--lm_file_sl',
                        type=str,
                        help="SL language model output file.")
    groupO.add_argument('--lm_file_tl',
                        type=str,
                        help="TL language model output file.")
    groupO.add_argument(
        '--lm_training_file_sl',
        type=str,
        help=
        "SL text from which the SL LM is trained. If this parameter is not specified, SL LM is trained from the SL side of the input file, after removing --lm_dev_size sentences."
    )
    groupO.add_argument(
        '--lm_training_file_tl',
        type=str,
        help=
        "TL text from which the TL LM is trained. If this parameter is not specified, TL LM is trained from the TL side of the input file, after removing --lm_dev_size sentences."
    )
    groupO.add_argument(
        '--lm_clean_examples_file_sl',
        type=str,
        help=
        "File with clean text in the SL. Used to estimate the perplexity of clean text. This option must be used together with --lm_training_file_sl and both files must not have common sentences. This option replaces --lm_dev_size."
    )
    groupO.add_argument(
        '--lm_clean_examples_file_tl',
        type=str,
        help=
        "File with clean text in the TL. Used to estimate the perplexity of clean text. This option must be used together with --lm_training_file_tl and both files must not have common sentences. This option replaces --lm_dev_size."
    )

    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Silent logging mode')
    groupL.add_argument('--debug',
                        action='store_true',
                        help='Debug logging mode')
    groupL.add_argument('--logfile',
                        type=argparse.FileType('a'),
                        default=sys.stderr,
                        help="Store log to a file")

    args = parser.parse_args()
    # Logging
    logging_setup(args)
    return args
        logging.debug("Return gzipped")
        #f_dict.close()
        #afterpruning_dict.close()
        #afterpruning_dict.seek(0)
        with open(temp_file_name, 'rb') as ngzd:
            with gzip.open(f_dict, 'wb') as gzd:
                shutil.copyfileobj(ngzd, gzd)
    else:
        logging.debug("Not gzipped")
        #f_dict.close()
        with open(temp_file_name, 'r') as ngzd:
            with open(f_dict.name, 'wb') as gzd:
                shutil.copyfile(temp_file_name, f_dict.name)


#                for i in ngzd:
#                    gzd.write(i)
#       f_dict.close()

if __name__ == '__main__':
    try:
        logging_setup()
        args = initialization()  # Parsing parameters
        logging_setup(args)
        main(args)  # Running main program
        logging.info("Program finished")
    except Exception:
        tb = traceback.format_exc()
        logging.error(tb)
        sys.exit(1)
def initialization():
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument(
        'input',
        type=argparse.FileType('r'),
        default=None,
        help="Configuration file. Must contain a freq_path dict_path pair on each line."
    )
    ## Output file. Try to open it to check if it exists or can be created
    parser.add_argument('output',
                        type=argparse.FileType('wb+'),
                        default=None,
                        help="Merged probabilistic dictionary.")
    parser.add_argument('--stopwords',
                        type=argparse.FileType('w+'),
                        default="stopwords",
                        help="File with stopwords",
                        required=False)
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version="%(prog)s " + __version__,
                        help="show version of this script and exit")
    parser.add_argument('-g',
                        '--gzipped',
                        action='store_true',
                        help="Compresses the output file")
    ## Parameters required
    #groupM = parser.add_argument_group('mandatory arguments')
    #groupM.add_argument('-s', '--source_lang', required=True, help="Source language of the input")
    #groupM.add_argument('-t', '--target_lang', required=True, help="Target language of the input")

    # Options group
    groupO = parser.add_argument_group('options')
    groupO.add_argument('-s',
                        '--stopwords_amount',
                        type=int,
                        default=0,
                        help="Amount of words to mark as stopwords")
    groupO.add_argument(
        '-n',
        '--prune_ratio',
        type=float,
        default=10,
        help="Ratio to prune the dictionary. Translations whose probability is {} times (default) smaller than the maximum one are removed."
        .format(10))
    groupO.add_argument(
        '-f',
        '--cutoff_freq',
        type=int,
        default=1,
        help=
        "Cutoff frequency for merged dictionary (all those equal or below are removed)"
    )
    groupO.add_argument(
        '-k',
        '--keep_tmp',
        action='store_true',
        default=False,
        help="This flag specifies whether removing temporal folder or not")
    groupO.add_argument(
        '-m',
        '--tmp_dir',
        type=check_if_folder,
        default=gettempdir(),
        help="Temporary directory where this program's temporary files are created"
    )

    # Logging group
    groupL = parser.add_argument_group('logging')
    groupL.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Silent logging mode')
    groupL.add_argument('--debug',
                        action='store_true',
                        help='Debug logging mode')
    groupL.add_argument('--logfile',
                        type=argparse.FileType('a'),
                        default=sys.stderr,
                        help="Store log to a file")

    # Validating & parsing
    args = parser.parse_args()
    logging_setup(args)

    # Extra-checks for args here
    if args.prune_ratio != 0:
        args.prune_ratio = math.log(args.prune_ratio)

    return args
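Note that both pruning examples replace --prune_ratio with math.log(prune_ratio) before returning, which suggests the dictionary probabilities are compared in log space: p_max / p > ratio is equivalent to log(p_max) - log(p) > log(ratio). A quick check of that equivalence with made-up numbers:

import math

p_max, p, ratio = 0.6, 0.05, 10.0
in_prob_space = p_max / p > ratio                               # 12.0 > 10.0
in_log_space = math.log(p_max) - math.log(p) > math.log(ratio)  # same test, log domain
assert in_prob_space == in_log_space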
Example 11
def initialization():
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    # Mandatory parameters
    ##Dictionary file
    parser.add_argument(
        'dictionary',
        type=argparse.FileType('r'),
        default=None,
        help="Dictionary file. Line format: Target Source Prob")
    ## Output file. Try to open it to check if it exists or can be created
    parser.add_argument('output',
                        type=argparse.FileType('wb+'),
                        default=None,
                        help="Pruned probabilistic dictionary.")
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version="%(prog)s " + __version__,
                        help="show version of this script and exit")
    parser.add_argument('-g',
                        '--gzipped',
                        action='store_true',
                        help="Compresses the output file")

    # Options group
    groupO = parser.add_argument_group('options')
    groupO.add_argument(
        '-n',
        '--prune_ratio',
        type=float,
        default=10,
        help="Ratio to prune the dictionary. Translations whose probability is {} times (default) smaller than the maximum one are removed."
        .format(10))
    groupO.add_argument(
        '-k',
        '--keep_tmp',
        action='store_true',
        default=False,
        help="This flag specifies whether removing temporal folder or not")
    groupO.add_argument(
        '-m',
        '--tmp_dir',
        type=check_if_folder,
        default=gettempdir(),
        help="Temporary directory where this program's temporary files are created"
    )

    # Logging group
    groupL = parser.add_argument_group('logging')
    groupL.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Silent logging mode')
    groupL.add_argument('--debug',
                        action='store_true',
                        help='Debug logging mode')
    groupL.add_argument('--logfile',
                        type=argparse.FileType('a'),
                        default=sys.stderr,
                        help="Store log to a file")

    # Validating & parsing
    args = parser.parse_args()
    logging_setup(args)

    # Extra-checks for args here
    if args.prune_ratio != 0:
        args.prune_ratio = math.log(args.prune_ratio)

    return args
Example 12
def initialization():
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="Tab-separated bilingual input file")
    parser.add_argument(
        '-o',
        '--output_dir',
        required=True,
        type=str,
        default=os.getcwd(),
        help="Output directory. Cleaned corpus and dictionary will be created here. The folder will be created if it does not exist"
    )
    parser.add_argument(
        '--giza',
        required=True,
        type=str,
        help=
        "GIZA++ folder path, which contains binaries. Expected scripts in the folder: {}, {} and {}"
        .format(GIZA_MKCLS, GIZA_MGIZA, GIZA_SNT2COOC))
    parser.add_argument(
        '--moses_dir',
        required=True,
        type=str,
        help="Moses scripts folder path, which contains the script {}".format(
            TRAIN_MODEL_SCRIPT))

    ## Parameters required
    groupM = parser.add_argument_group('Mandatory')
    groupM.add_argument('-s',
                        '--source_lang',
                        required=True,
                        type=str,
                        help="Source language of the input")
    groupM.add_argument('-t',
                        '--target_lang',
                        required=True,
                        type=str,
                        help="Target language of the input")

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument(
        '-m',
        '--tmp_dir',
        type=check_if_folder,
        default=gettempdir(),
        help="Temporary directory where this program's temporary files are created"
    )
    groupO.add_argument('-b',
                        '--block_size',
                        type=int,
                        default=10000,
                        help="Sentence pairs per block")
    groupO.add_argument('-p',
                        '--processes',
                        type=int,
                        default=max(1,
                                    cpu_count() - 1),
                        help="Number of processes to use")
    groupO.add_argument(
        '-r',
        '--giza_ratio',
        type=float,
        default=9,
        help="9-1 Sentence ratio limit of GIZA++ (it shouldn't be modified)")
    groupO.add_argument(
        '-n',
        '--prune_ratio',
        type=float,
        default=10,
        help="Ratio to prune the dictionary. Translations whose probability is {} times (default) smaller than the maximum one are removed."
        .format(10))
    groupO.add_argument('--min',
                        type=int,
                        default=1,
                        help="Minimum number of tokens allowed for a sentence")
    groupO.add_argument('--max',
                        type=int,
                        default=50,
                        help="Maximum number of tokens allowed for a sentence")

    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Silent logging mode')
    groupL.add_argument('--debug',
                        action='store_true',
                        help='Debug logging mode')
    groupL.add_argument('--logfile',
                        type=argparse.FileType('a'),
                        default=sys.stderr,
                        help="Store log to a file")
    groupL.add_argument('-v',
                        '--version',
                        action='version',
                        version="%(prog)s " + __version__,
                        help="show version of this script and exit")

    # Validating & parsing
    args = parser.parse_args()
    logging_setup(args)

    # Extra-checks for args here
    args.prune_ratio = math.log(args.prune_ratio)
    if not args.output_dir.endswith("/"):
        args.output_dir = args.output_dir + "/"
    if not args.moses_dir.endswith("/"):
        args.moses_dir = args.moses_dir + "/"
    args.output_moses_corpus = args.output_dir + CLEAN_OUTPUT
    args.moses_train_script = args.moses_dir + TRAIN_MODEL_SCRIPT

    # Checking if moses scripts exist before running previous processes
    if not os.path.isdir(os.path.expanduser(args.output_dir)):
        logging.info("The output folder {} doesn't exist. Creating...".format(
            args.output_dir))
        os.makedirs(os.path.expanduser(args.output_dir))
        logging.info("Output folder created at {}".format(args.output_dir))
    if not os.path.isfile(args.moses_train_script):
        raise argparse.ArgumentTypeError(
            "Moses script {} cannot be found in path {}".format(
                TRAIN_MODEL_SCRIPT, args.moses_train_script))
    if not os.path.isfile(args.giza + GIZA_MGIZA) or not os.path.isfile(
            args.giza + GIZA_MKCLS) or not os.path.isfile(args.giza +
                                                          GIZA_SNT2COOC):
        raise argparse.ArgumentTypeError(
            "Necessary GIZA++ scripts cannot be found in path {}. Please check if some of the following scripts are missing in the folder: {}, {} and {}"
            .format(args.giza, GIZA_MGIZA, GIZA_MKCLS, GIZA_SNT2COOC))
    # Intermediary files
    args.output_source = open(
        "{}{}".format(args.output_dir, CLEAN_OUTPUT + "." + args.source_lang),
        "w")
    args.output_target = open(
        "{}{}".format(args.output_dir, CLEAN_OUTPUT + "." + args.target_lang),
        "w")

    # Final dicts names
    args.dict_sl_tl_final = "{}{}".format(
        args.output_dir,
        DICT_FINAL_NAME.format(args.source_lang, args.target_lang))
    args.dict_tl_sl_final = "{}{}".format(
        args.output_dir,
        DICT_FINAL_NAME.format(args.target_lang, args.source_lang))

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
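Example 12 builds paths by appending "/" and concatenating strings. os.path.join is the safer equivalent and makes the trailing-slash bookkeeping unnecessary; a sketch with hypothetical values:

import os

output_dir = "/data/output"      # hypothetical; works with or without a trailing slash
CLEAN_OUTPUT = "clean.corpus"    # hypothetical constant from the script
source_lang = "en"

print(os.path.join(output_dir, CLEAN_OUTPUT))                      # /data/output/clean.corpus
print(os.path.join(output_dir, CLEAN_OUTPUT + "." + source_lang))  # /data/output/clean.corpus.en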
Example 13
def initialization():
    global logging_level

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)
    parser.add_argument('input',
                        nargs='?',
                        type=argparse.FileType('rt', errors="replace"),
                        default=io.TextIOWrapper(sys.stdin.buffer,
                                                 errors="replace"),
                        help="Tab-separated bilingual tagged file")
    parser.add_argument('output',
                        nargs='?',
                        type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help="Output of the classification")
    parser.add_argument(
        '--annotated_output',
        default=False,
        action='store_true',
        help="Adds an extra column with each sentence's evaluation (\"keep\" if the sentence is good, otherwise the reason for rejecting it)"
    )

    #groupM = parser.add_argument_group('Mandatory')
    #groupM.add_argument("-s", "--source_lang", type=str, required=True, help="Source language (SL) of the input")
    #groupM.add_argument("-t", "--target_lang", type=str, required=True, help="Target language (TL) of the input")

    groupO = parser.add_argument_group('Optional')
    groupO.add_argument(
        '--tmp_dir',
        default=gettempdir(),
        help="Temporary directory where this program's temporary files are created"
    )
    groupO.add_argument('-b',
                        '--block_size',
                        type=int,
                        default=10000,
                        help="Sentence pairs per block")
    groupO.add_argument('-p',
                        '--processes',
                        type=int,
                        default=max(1,
                                    cpu_count() - 1),
                        help="Number of processes to use")

    groupO.add_argument('--disable_lang_ident',
                        default=False,
                        action='store_true',
                        help="Don't apply rules that use language detecting")
    groupO.add_argument('--disable_minimal_length',
                        default=False,
                        action='store_true',
                        help="Don't apply minimal length rule")
    groupO.add_argument('--disable_porn_removal',
                        default=False,
                        action='store_true',
                        help="Don't apply p**n removal")

    groupO.add_argument("-s",
                        "--source_lang",
                        type=str,
                        default=None,
                        help="Source language (SL) of the input")
    groupO.add_argument("-t",
                        "--target_lang",
                        type=str,
                        default=None,
                        help="Target language (TL) of the input")

    groupO.add_argument("--scol",
                        default=1,
                        type=check_positive,
                        help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol",
                        default=2,
                        type=check_positive,
                        help="Target sentence column (starting in 1)")

    groupO.add_argument("-S",
                        "--source_tokenizer_command",
                        default=None,
                        type=str,
                        help="Source language (SL) tokenizer full command")
    groupO.add_argument("-T",
                        "--target_tokenizer_command",
                        default=None,
                        type=str,
                        help="Target language (TL) tokenizer full command")

    # LM filtering
    groupO.add_argument('--disable_lm_filter',
                        default=False,
                        action='store_true',
                        help="Don't apply LM filtering")
    groupO.add_argument('--metadata',
                        type=argparse.FileType('r'),
                        default=None,
                        help="Training metadata (YAML file)")
    groupO.add_argument('--lm_threshold',
                        type=check_positive_between_zero_and_one,
                        default=0.5,
                        help="Threshold for language model fluency scoring.")
    #groupO.add_argument('--keep_lm_result',action='store_true', help="Add an additional column to the results with the language model fluency score.")

    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Silent logging mode')
    groupL.add_argument('--debug',
                        action='store_true',
                        help='Debug logging mode')
    groupL.add_argument('--logfile',
                        type=argparse.FileType('a'),
                        default=sys.stderr,
                        help="Store log to a file")
    #groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

    args = parser.parse_args()
    logging_setup(args)

    logging_level = logging.getLogger().level

    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    # Try loading metadata for LM filtering and porn removal
    if not (args.disable_lm_filter
            and args.disable_porn_removal) and args.metadata is not None:
        logging.info("Loading metadata info")

        try:
            args.metadata_yaml = yaml.safe_load(args.metadata)
            args.metadata_yaml["yamlpath"] = os.path.dirname(
                os.path.abspath(args.metadata.name))

            if not ("source_lm" in args.metadata_yaml
                    and "target_lm" in args.metadata_yaml):
                args.disable_lm_filter = True
                logging.warning("LM file not present in metadata.")
            if not ("porn_removal_file" in args.metadata_yaml):
                args.disable_porn_removal = True
                logging.warning(
                    "P**n removal classifier not present in metadata.")
            else:
                try:
                    args.porn_removal = fasttext.load_model(
                        os.path.join(args.metadata_yaml["yamlpath"],
                                     args.metadata_yaml['porn_removal_file']))
                except Exception:
                    args.porn_removal = fasttext.load_model(
                        args.metadata_yaml['porn_removal_file'])

            if "source_tokenizer_command" in args.metadata_yaml:
                args.source_tokenizer_command = args.metadata_yaml[
                    "source_tokenizer_command"]
            if "target_tokenizer_command" in args.metadata_yaml:
                args.target_tokenizer_command = args.metadata_yaml[
                    "target_tokenizer_command"]

            parser.set_defaults(**args.metadata_yaml)

        except Exception:
            logging.warning("Error loading metadata.")
            args.disable_lm_filter = True
            args.disable_porn_removal = True
            traceback.print_exc()
            #sys.exit(1)
    else:
        if args.metadata is None:
            logging.warning("Metadata file not provided.")
            args.disable_lm_filter = True
            args.disable_porn_removal = True

    if args.source_lang is None or args.target_lang is None:
        if args.metadata is None:
            logging.error("No source or target languages provided.")
            sys.exit(1)
        else:
            try:
                if not "metadata_yaml" in args or args.metadata_yaml == None:
                    args.metadata_yaml = yaml.safe_load(args.metadata)
                #args.metadata_yaml["yamlpath"] = os.path.dirname(os.path.abspath(args.metadata.name))

                args.source_lang = args.metadata_yaml["source_lang"]
                args.target_lang = args.metadata_yaml["target_lang"]
            except Exception:
                traceback.print_exc()
                logging.error(
                    "Error retrieving source or target languages from metadata."
                )
                sys.exit(1)

    if args.disable_lm_filter:
        logging.info("LM filtering disabled.")
    if args.disable_porn_removal:
        logging.info("P**n removal disabled.")

    return args
Example 14
def initialization():
    global nline
    global logging_level
    
    nline = 0
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None, help="Tab-separated files to be classified")      
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output of the classification")
    parser.add_argument('metadata', type=argparse.FileType('r'), default=None, help="Training metadata (YAML file)")    

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S", "--source_tokeniser_path", type=str, help="Source language (SL) tokeniser executable absolute path")
    groupO.add_argument("-T", "--target_tokeniser_path", type=str, help="Target language (TL) tokeniser executable absolute path")

    groupO.add_argument("--scol", default=3, type=check_positive, help ="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol", default=4, type=check_positive, help ="Target sentence column (starting in 1)")    


    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where this program's temporary files are created")
    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file with discarded TUs. TUs discarded by the classifier are written to this file.")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for classifier. If an accuracy histogram is present in the metadata, the interval for its maximum value will be used as the default instead.")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold are removed (classifier score set to 0), unless the option --keep_lm_result is set.")
    groupO.add_argument('--keep_lm_result',action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.")
     
    groupO.add_argument('--score_only',action='store_true', help="Only output one column which is the bicleaner score", default=False)
     
    groupO.add_argument('--disable_hardrules', action='store_true', help="Disables the bicleaner_hardrules filtering (only bicleaner_classify is applied)")
    groupO.add_argument('--disable_lang_ident', default=False, action='store_true', help="Don't apply hardrules that use language detection")
    
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

    # Validating & parsing
    # Checking if metadata is specified
    args = parser.parse_args()
    logging_setup(args)
    
    logging_level = logging.getLogger().level    

    if logging_level <= logging.WARNING and logging_level != logging.DEBUG:
        #Getting rid of INFO messages when Moses processes start
        logging.getLogger("MosesTokenizer").setLevel(logging.WARNING)
        logging.getLogger("MosesSentenceSplitter").setLevel(logging.WARNING)
        logging.getLogger("MosesPunctuationNormalizer").setLevel(logging.WARNING)
            
    try:
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))

        metadata_yaml = yaml.safe_load(args.metadata)

        args.source_lang=metadata_yaml["source_lang"]
        args.target_lang=metadata_yaml["target_lang"]
        if "source_tokeniser_path" in metadata_yaml:
            args.source_tokeniser_path=metadata_yaml["source_tokeniser_path"]
        if "target_tokeniser_path" in metadata_yaml:
            args.target_tokeniser_path=metadata_yaml["target_tokeniser_path"]

        try:
            args.clf = joblib.load(os.path.join(yamlpath, metadata_yaml["classifier"]))
        except Exception:
            # Fall back to the classifier path exactly as given in the metadata
            args.clf = joblib.load(metadata_yaml["classifier"])
        
#        args.clf.n_jobs = None    
        args.classifier_type=metadata_yaml["classifier_type"]


        try:
            args.dict_sl_tl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except Exception:
            args.dict_sl_tl = ProbabilisticDictionary(metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except Exception:
            args.dict_tl_sl = ProbabilisticDictionary(metadata_yaml["target_dictionary"])

        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.good_examples = metadata_yaml["good_examples"]
        args.wrong_examples = metadata_yaml["wrong_examples"]
        args.good_test_examples = metadata_yaml["good_test_examples"]
        args.wrong_test_examples = metadata_yaml["wrong_test_examples"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if "features_version" not in metadata_yaml else int(metadata_yaml["features_version"])
        
        threshold = np.argmax(metadata_yaml["accuracy_histogram"])*0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold
        
        # Load LM components if the model was trained with them
        if "source_lm" in metadata_yaml and "target_lm" in metadata_yaml:
            lm_filter = DualLMFluencyFilter(LMType[metadata_yaml['lm_type']], args.source_lang, args.target_lang)
            stats = DualLMStats(metadata_yaml['clean_mean_perp'], metadata_yaml['clean_stddev_perp'], metadata_yaml['noisy_mean_perp'], metadata_yaml['noisy_stddev_perp'])

            fullpath_source_lm = os.path.join(yamlpath, metadata_yaml['source_lm'])
            if os.path.isfile(fullpath_source_lm):
                source_lm = fullpath_source_lm
            else:
                source_lm = metadata_yaml['source_lm']

            fullpath_target_lm = os.path.join(yamlpath, metadata_yaml['target_lm'])
            if os.path.isfile(fullpath_target_lm):
                target_lm = fullpath_target_lm
            else:
                target_lm = metadata_yaml['target_lm']
            lm_filter.load(source_lm, target_lm, stats)
            args.lm_filter = lm_filter
        else:
            args.lm_filter = None
        
        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)   
   
    except Exception:
        logging.error("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)
    
    # Ensure that the temporary directory exists; if not, create it
    os.makedirs(args.tmp_dir, exist_ok=True)

    if args.score_only and args.keep_lm_result:
        raise AssertionError("Conflicting arguments: cannot output bicleaner score only AND keep language model result")

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
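For reference, the initialization above reconstructs its whole runtime configuration from the metadata YAML. Below is a minimal sketch of such a file, parsed with yaml.safe_load; every value is an illustrative placeholder inferred from the keys the code reads, not taken from a real model. With this sample histogram, np.argmax(...) * 0.1 would report 0.5 as the ideal threshold.

import yaml

# Hypothetical metadata contents; the keys mirror those read by initialization().
SAMPLE_METADATA = """
source_lang: en
target_lang: es
classifier: classifier.joblib
classifier_type: extra_trees
source_dictionary: dict-en-es.gz
target_dictionary: dict-es-en.gz
normalize_by_length: true
treat_oovs: true
qmax_limit: 40
disable_features_quest: true
good_examples: 50000
wrong_examples: 50000
good_test_examples: 2000
wrong_test_examples: 2000
length_ratio: 1.0
features_version: 1
accuracy_histogram: [0.0, 0.1, 0.3, 0.6, 0.9, 0.95, 0.9, 0.7, 0.4, 0.2]
"""

metadata = yaml.safe_load(SAMPLE_METADATA)
assert metadata["classifier_type"] == "extra_trees"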
Example No. 15
      tb = traceback.format_exc()
      logging.error("Unable to extract text from TMX")
      logging.error(tb)
      sys.exit(1)
  else:
    sentences = args.input

  source_names_module = binonymizer_core.selectNamesModule(args.srclang)
  target_names_module = binonymizer_core.selectNamesModule(args.trglang)
  

  binonymizer_process(args, sentences, regex_module, source_names_module, target_names_module, address_module)

  # TODO: rebuild TMX files with annotations from binonymizer
  if args.format == "tmx":
    # Rebuild TMX with anonymization annotations
    logging.warning("********************* Unsupported feature!! ********************")
  logging.info("Program finished")

if __name__ == '__main__':
  try:
    util.logging_setup()
    args = initialization() # Parsing parameters
    main(args)  # Running main program
  except Exception:
    tb = traceback.format_exc()
    logging.error(tb)
    sys.exit(1)
Example No. 16
def initialization():
    logging.info("Processing arguments...")
    # Getting arguments and options with argparse
    # Initialization of the argparse class
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    # Mandatory parameters
    ## Input file. Try to open it to check if it exists
    parser.add_argument('input', type=argparse.FileType('rt'), default=None, help="Tab-separated files to be classified")      
    parser.add_argument('output', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="Output of the classification")
    parser.add_argument('metadata', type=argparse.FileType('r'), default=None, help="Training metadata (YAML file)")    

    # Options group
    groupO = parser.add_argument_group('Optional')
    groupO.add_argument("-S", "--source_tokeniser_path", type=str, help="Source language (SL) tokeniser executable absolute path")
    groupO.add_argument("-T", "--target_tokeniser_path", type=str, help="Target language (TL) tokeniser executable absolute path")
    
    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where the temporary files of this program are created")
    groupO.add_argument('-b', '--block_size', type=int, default=200, help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count()-1), help="Number of processes to use")

    groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file where TUs discarded by the classifier are written")
    groupO.add_argument('--threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for classifier. If an accuracy histogram is present in the metadata, the interval for its maximum value is used as the default instead of the current default.")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold are removed (classifier score set to 0), unless the option --keep_lm_result is set.")
    groupO.add_argument('--keep_lm_result', action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.")
    
    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
    groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

    # Validating & parsing
    # Checking if metadata is specified
    args = parser.parse_args()
    logging_setup(args)

    try:
        yamlpath = os.path.dirname(os.path.abspath(args.metadata.name))

        metadata_yaml = yaml.safe_load(args.metadata)

        args.source_lang=metadata_yaml["source_lang"]
        args.target_lang=metadata_yaml["target_lang"]
        if "source_tokeniser_path" in metadata_yaml:
            args.source_tokeniser_path=metadata_yaml["source_tokeniser_path"]
        if "target_tokeniser_path" in metadata_yaml:
            args.target_tokeniser_path=metadata_yaml["target_tokeniser_path"]        

        try:
            args.clf = joblib.load(os.path.join(yamlpath, metadata_yaml["classifier"]))
        except Exception:
            # Fall back to the classifier path exactly as given in the metadata
            args.clf = joblib.load(metadata_yaml["classifier"])
        
#        args.clf.n_jobs = None    
        args.classifier_type=metadata_yaml["classifier_type"]


        try:
            args.dict_sl_tl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["source_dictionary"]))
        except Exception:
            args.dict_sl_tl = ProbabilisticDictionary(metadata_yaml["source_dictionary"])
        try:
            args.dict_tl_sl = ProbabilisticDictionary(os.path.join(yamlpath, metadata_yaml["target_dictionary"]))
        except Exception:
            args.dict_tl_sl = ProbabilisticDictionary(metadata_yaml["target_dictionary"])

        args.normalize_by_length = metadata_yaml["normalize_by_length"]
        args.treat_oovs = metadata_yaml["treat_oovs"]
        args.qmax_limit = metadata_yaml["qmax_limit"]
        args.disable_features_quest = metadata_yaml["disable_features_quest"]
        args.good_examples = metadata_yaml["good_examples"]
        args.wrong_examples = metadata_yaml["wrong_examples"]
        args.good_test_examples = metadata_yaml["good_test_examples"]
        args.wrong_test_examples = metadata_yaml["wrong_test_examples"]
        args.length_ratio = metadata_yaml["length_ratio"]
        args.features_version = 1 if  "features_version" not in metadata_yaml else int(metadata_yaml["features_version"])
        
        threshold = np.argmax(metadata_yaml["accuracy_histogram"])*0.1
        logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
        logging.info("Ideal threshold: {:1.1f}".format(threshold))
        metadata_yaml["threshold"] = threshold
        
        # Load LM components if the model was trained with them
        if "source_lm" in metadata_yaml and "target_lm" in metadata_yaml:
            fullpath_source_lm = os.path.join(yamlpath, metadata_yaml['source_lm'])
            if os.path.isfile(fullpath_source_lm):
                args.source_lm = fullpath_source_lm
            else:
                args.source_lm = metadata_yaml['source_lm']

            fullpath_target_lm = os.path.join(yamlpath, metadata_yaml['target_lm'])
            if os.path.isfile(fullpath_target_lm):
                args.target_lm = fullpath_target_lm
            else:
                args.target_lm = metadata_yaml['target_lm']

            args.lm_type = LMType[metadata_yaml['lm_type']]
            args.lm_filter_stats = DualLMStats(metadata_yaml['clean_mean_perp'], metadata_yaml['clean_stddev_perp'], metadata_yaml['noisy_mean_perp'], metadata_yaml['noisy_stddev_perp'])
        else:
            args.source_lm = None
            args.target_lm = None
            args.lm_type = None
            args.lm_filter_stats = None

        logging.debug("YAML")
        logging.debug(metadata_yaml)
        parser.set_defaults(**metadata_yaml)   
   
    except Exception:
        logging.error("Error loading metadata")
        traceback.print_exc()
        sys.exit(1)
    
    # Ensure that the temporary directory exists; if not, create it
    os.makedirs(args.tmp_dir, exist_ok=True)

    logging.debug("Arguments processed: {}".format(str(args)))
    logging.info("Arguments processed.")
    return args
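The --block_size and --processes options in this variant suggest that classification is parallelized over blocks of sentence pairs. Here is a minimal sketch of that layout under stated assumptions: classify_block is a hypothetical stand-in for the real per-block scoring, and the chunking uses plain multiprocessing rather than the program's actual worker code.

import multiprocessing
from itertools import islice

def classify_block(block):
    # Hypothetical stand-in for the real per-block classification.
    return [line.upper() for line in block]

def read_blocks(input_file, block_size):
    # Yield lists of at most block_size lines from the input file.
    while True:
        block = list(islice(input_file, block_size))
        if not block:
            return
        yield block

def classify_parallel(input_file, block_size=200, processes=2):
    # Fan blocks out to a pool; imap preserves input order in the output.
    with multiprocessing.Pool(processes) as pool:
        for result in pool.imap(classify_block, read_blocks(input_file, block_size)):
            yield from result

if __name__ == "__main__":
    import io
    for line in classify_parallel(io.StringIO("hola\tmundo\n" * 5), block_size=2):
        print(line, end="")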
Example No. 17
def initialization():
    global ilines
    global olines

    ilines = 0
    olines = 0

    logging.info("Processing arguments...")
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)

    # Mandatory parameters
    # Input file
    parser.add_argument('input',
                        type=argparse.FileType('rt'),
                        default=None,
                        help="Tab-separated files to be bifixed")
    # Output file (corpus)
    parser.add_argument('output',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="Fixed corpus")
    # Source language
    parser.add_argument("srclang",
                        type=str,
                        help="Source language (SL) of the input")
    # Target language
    parser.add_argument("trglang",
                        type=str,
                        help="Target language (TL) of the input")

    # Options group
    groupO = parser.add_argument_group('Optional')
    # Format
    groupO.add_argument("--scol",
                        default=3,
                        type=util.check_positive,
                        help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol",
                        default=4,
                        type=util.check_positive,
                        help="Target sentence column (starting in 1)")
    groupO.add_argument(
        "--sdeferredcol",
        type=util.check_positive,
        help="Source deferred standoff annotation column (starting in 1)")
    groupO.add_argument(
        "--tdeferredcol",
        type=util.check_positive,
        help="Target deferred standoff annotation column (starting in 1)")

    # Character fixing
    groupO.add_argument(
        '--ignore_characters',
        default=False,
        action='store_true',
        help="Doesn't fix mojibake, orthography, or other character issues")

    # Empty sides
    groupO.add_argument(
        '--ignore_empty',
        default=False,
        action='store_true',
        help="Doesn't remove sentences with empty source or target")

    # Too long sides
    groupO.add_argument('--ignore_long',
                        default=False,
                        action='store_true',
                        help="Doesn't ignore too long sentences")

    # Orthography
    groupO.add_argument('--ignore_orthography',
                        default=False,
                        action='store_true',
                        help="Doesn't apply orthography fixing")

    # Deduplication
    groupO.add_argument('--ignore_duplicates',
                        default=False,
                        action='store_true',
                        help="Doesn't obtain the hashes of parallel sentences")
    groupO.add_argument(
        '--aggressive_dedup',
        default=False,
        action='store_true',
        help=
        "Treats similar sentences as duplicates (marking them with the same hash)"
    )

    # Segmentation
    groupO.add_argument('--ignore_segmentation',
                        default=False,
                        action='store_true',
                        help="Doesn't change segmentation of long sentences")
    groupO.add_argument(
        '--words_before_segmenting',
        default=15,
        type=util.check_positive,
        help=
        "Max words allowed in one side of a parallel sentence before trying to segment it. Set to 0 to apply segmentation to everything."
    )
    groupO.add_argument('--segmenter',
                        default="nltk",
                        type=str,
                        choices=["nltk", "loomchild"],
                        help="Segmenter module.")
    groupO.add_argument(
        '--tmp_dir',
        default=gettempdir(),
        help=
        "Temporary directory where the temporary files of this program are created"
    )

    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Silent logging mode')
    groupL.add_argument('--debug',
                        action='store_true',
                        help='Debug logging mode')
    groupL.add_argument('--logfile',
                        type=argparse.FileType('a'),
                        default=sys.stderr,
                        help="Store log to a file")
    groupL.add_argument('-v',
                        '--version',
                        action='version',
                        version="%(prog)s " + __version__,
                        help="show version of this script and exit")

    # Validating & parsing
    args = parser.parse_args()
    util.logging_setup(args)
    args.dedup = not args.ignore_duplicates  # more friendly usage of the ignore_duplicates flag

    logging.debug("Arguments processed: {}".format(str(args)))

    logging.info("Arguments processed.")

    return args
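The --ignore_duplicates and --aggressive_dedup flags above imply that deduplication works by hashing each parallel sentence pair, with the aggressive mode normalizing the text first so that near-identical pairs collide. A rough sketch of that idea, assuming MD5 and a simple lowercase/whitespace normalization (the actual tool may normalize differently):

import hashlib
import re

def pair_hash(src, trg, aggressive=False):
    # Hash a parallel sentence pair; with aggressive=True, normalize first
    # so near-duplicate pairs receive the same hash.
    if aggressive:
        src = re.sub(r"\s+", " ", src.lower()).strip()
        trg = re.sub(r"\s+", " ", trg.lower()).strip()
    return hashlib.md5(f"{src}\t{trg}".encode("utf-8")).hexdigest()

seen = set()
for src, trg in [("Hello  World", "Hola Mundo"), ("hello world", "hola mundo")]:
    h = pair_hash(src, trg, aggressive=True)
    if h in seen:
        print("duplicate:", src, "|||", trg)
    seen.add(h)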
Example No. 18
    output_queue.put(None)
    reduce.join()

    if args.annotated_output:
        args.annotated_output.close()

    # Stats
    logging.info("Finished")
    elapsed_time = default_timer() - time_start
    logging.info("Total: {0} rows".format(nline))
    logging.info("Elapsed time {0:.2f} s".format(elapsed_time))
    logging.info("Troughput: {0} rows/s".format(
        int((nline * 1.0) / elapsed_time)))


def main(args):
    logging.info("Executing main program...")
    perform_hardrules_filtering(args)
    logging.info("Program finished")


if __name__ == '__main__':
    try:
        logging_setup()
        args = initialization()
        main(args)
    except Exception:
        tb = traceback.format_exc()
        logging.error(tb)
        sys.exit(1)
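The output_queue.put(None) / reduce.join() lines in the fragment above are the usual sentinel shutdown for a queue-fed reducer process: the producer pushes None once it is done, and the reducer drains the queue until it sees that sentinel. A self-contained sketch of the pattern, with illustrative names rather than the program's own:

from multiprocessing import Process, Queue

def reducer(q):
    # Consume items until the None sentinel arrives.
    while True:
        item = q.get()
        if item is None:
            break
        print("reduced:", item)

if __name__ == "__main__":
    q = Queue()
    reduce_proc = Process(target=reducer, args=(q,))
    reduce_proc.start()
    for row in range(3):
        q.put(row)
    q.put(None)       # sentinel: no more work
    reduce_proc.join()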
Example No. 19
def initialization():

    global logging_level

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)

    parser.add_argument('input',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="Tab-separated bilingual input file")

    groupM = parser.add_argument_group("Mandatory")
    groupM.add_argument('-m',
                        '--metadata',
                        type=argparse.FileType('w'),
                        required=True,
                        help="Training metadata (YAML file)")
    groupM.add_argument('-c',
                        '--classifier',
                        type=argparse.FileType('wb'),
                        required=True,
                        help="Classifier data file")
    groupM.add_argument('-s',
                        '--source_lang',
                        required=True,
                        help="Source language")
    groupM.add_argument('-t',
                        '--target_lang',
                        required=True,
                        help="Target language")
    groupM.add_argument('-d',
                        '--source_dictionary',
                        type=argparse.FileType('r'),
                        required=True,
                        help="LR gzipped probabilistic dictionary")
    groupM.add_argument('-D',
                        '--target_dictionary',
                        type=argparse.FileType('r'),
                        required=True,
                        help="RL gzipped probabilistic dictionary")
    groupM.add_argument('-f',
                        '--source_word_freqs',
                        type=argparse.FileType('r'),
                        default=None,
                        required=True,
                        help="L language gzipped list of word frequencies")
    groupM.add_argument('-F',
                        '--target_word_freqs',
                        type=argparse.FileType('r'),
                        default=None,
                        required=True,
                        help="R language gzipped list of word frequencies")

    groupO = parser.add_argument_group('Options')
    groupO.add_argument('-S',
                        '--source_tokenizer_command',
                        help="Source language tokenizer full command")
    groupO.add_argument('-T',
                        '--target_tokenizer_command',
                        help="Target language tokenizer full command")
    groupO.add_argument('--normalize_by_length',
                        action='store_true',
                        help="Normalize by length in qmax dict feature")
    groupO.add_argument('--treat_oovs',
                        action='store_true',
                        help="Special treatment for OOVs in qmax dict feature")
    groupO.add_argument(
        '--qmax_limit',
        type=check_positive_or_zero,
        default=40,
        help=
        "Number of max target words to be taken into account, sorted by length"
    )
    groupO.add_argument('--disable_features_quest',
                        action='store_false',
                        help="Disable less important features")
    groupO.add_argument('--classifier_type',
                        choices=[
                            'mlp', 'extra_trees', 'svm', 'nn', 'nn1',
                            'adaboost', 'random_forest'
                        ],
                        default="extra_trees",
                        help="Classifier type")
    groupO.add_argument('--dump_features',
                        type=argparse.FileType('w'),
                        default=None,
                        help="Dump training features to file")
    groupO.add_argument('-b',
                        '--block_size',
                        type=check_positive,
                        default=10000,
                        help="Sentence pairs per block")
    groupO.add_argument('-p',
                        '--processes',
                        type=check_positive,
                        default=max(1,
                                    cpu_count() - 1),
                        help="Number of processes to use")
    groupO.add_argument(
        '--wrong_examples_file',
        type=argparse.FileType('r'),
        default=None,
        help=
        "File with wrong examples extracted to replace the synthetic examples produced by the default method"
    )
    groupO.add_argument('--features_version',
                        type=check_positive,
                        default=FEATURES_VERSION,
                        help="Version of the features")
    groupO.add_argument(
        '--disable_lang_ident',
        default=False,
        action='store_true',
        help="Don't apply features that use language detection")
    groupO.add_argument(
        '--seed',
        default=None,
        type=int,
        help="Seed for random number generation: by default, no seed is used")
    groupO.add_argument(
        '--relative_paths',
        action='store_true',
        help=
        "Ask training to save model files using relative paths if they are in the same directory as the metadata. Useful if you are going to train distributable models."
    )

    # For LM filtering
    groupO.add_argument(
        '--noisy_examples_file_sl',
        type=str,
        help=
        "File with noisy text in the SL. These are used to estimate the perplexity of noisy text."
    )
    groupO.add_argument(
        '--noisy_examples_file_tl',
        type=str,
        help=
        "File with noisy text in the TL. These are used to estimate the perplexity of noisy text."
    )
    groupO.add_argument(
        '--lm_dev_size',
        type=check_positive_or_zero,
        default=2000,
        help=
        "Number of sentences to be removed from clean text before training LMs. These are used to estimate the perplexity of clean text."
    )
    groupO.add_argument('--lm_file_sl',
                        type=str,
                        help="SL language model output file.")
    groupO.add_argument('--lm_file_tl',
                        type=str,
                        help="TL language model output file.")
    groupO.add_argument(
        '--lm_training_file_sl',
        type=str,
        help=
        "SL text from which the SL LM is trained. If this parameter is not specified, SL LM is trained from the SL side of the input file, after removing --lm_dev_size sentences."
    )
    groupO.add_argument(
        '--lm_training_file_tl',
        type=str,
        help=
        "TL text from which the TL LM is trained. If this parameter is not specified, TL LM is trained from the TL side of the input file, after removing --lm_dev_size sentences."
    )
    groupO.add_argument(
        '--lm_clean_examples_file_sl',
        type=str,
        help=
        "File with clean text in the SL. Used to estimate the perplexity of clean text. This option must be used together with --lm_training_file_sl and both files must not have common sentences. This option replaces --lm_dev_size."
    )
    groupO.add_argument(
        '--lm_clean_examples_file_tl',
        type=str,
        help=
        "File with clean text in the TL. Used to estimate the perplexity of clean text. This option must be used together with --lm_training_file_tl and both files must not have common sentences. This option replaces --lm_dev_size."
    )

    groupO.add_argument(
        '--porn_removal_train',
        type=argparse.FileType('r'),
        help=
        "File with training dataset for FastText classifier. Each sentence must contain at the beginning the '__label__negative' or '__label__positive' according to FastText convention. It should be lowercased and tokenized."
    )
    groupO.add_argument(
        '--porn_removal_test',
        type=argparse.FileType('r'),
        help=
        "Test set to compute precision and accuracy of the porn removal classifier"
    )
    groupO.add_argument('--porn_removal_file',
                        type=str,
                        help="Porn removal classifier output file")
    groupO.add_argument(
        '--porn_removal_side',
        choices=['sl', 'tl'],
        default="sl",
        help=
        "Whether the porn removal should be applied at the source or at the target language."
    )

    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Silent logging mode')
    groupL.add_argument('--debug',
                        action='store_true',
                        help='Debug logging mode')
    groupL.add_argument('--logfile',
                        type=argparse.FileType('a'),
                        default=sys.stderr,
                        help="Store log to a file")

    args = parser.parse_args()
    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    # Logging
    logging_setup(args)
    logging_level = logging.getLogger().level

    return args
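Judging from the classification-time loaders earlier in this document, the -m/--metadata file written by this trainer presumably records the same keys those loaders read back. A hedged sketch of emitting such a file with yaml.safe_dump; write_metadata and every field choice here are illustrative, not the trainer's actual code:

import yaml

def write_metadata(metadata_file, args, accuracy_histogram):
    # Illustrative: persist the fields that the classification-time
    # initialization() functions read back from the metadata YAML.
    out = {
        "source_lang": args.source_lang,
        "target_lang": args.target_lang,
        "classifier": args.classifier.name,
        "classifier_type": args.classifier_type,
        "source_dictionary": args.source_dictionary.name,
        "target_dictionary": args.target_dictionary.name,
        "normalize_by_length": args.normalize_by_length,
        "treat_oovs": args.treat_oovs,
        "qmax_limit": args.qmax_limit,
        "disable_features_quest": args.disable_features_quest,
        "features_version": args.features_version,
        "accuracy_histogram": accuracy_histogram,
    }
    yaml.safe_dump(out, metadata_file)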