Code example #1
def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                ojob = None
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                    feats = []
                    # TODO Test times with predict one-by-one and this impl
                    for i in filein:
                        features = feature_extract(i, source_tokenizer, target_tokenizer, args)
                        feats.append([float(v) for v in features])

                    if len(feats) > 0:
                        prediction = args.clf.predict_proba(np.array(feats))

                        row = 0
                        for pred in prediction:
                            fileout.write("{}\n".format(str(pred[1])))
                            row += 1
                    
                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                if ojob:
                    output_queue.put(ojob)
                    
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
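Usage note (added here for illustration; not part of the original project): the worker above exits when it reads a falsy job, so a driver normally pushes one None sentinel per worker. A minimal sketch of such a driver follows, assuming plain multiprocessing queues; run_workers, block_files and n_workers are hypothetical names introduced for this example.

from multiprocessing import Process, Queue

def run_workers(args, block_files, n_workers=4):
    # Hypothetical driver: each job is a (block_number, input_filename) tuple,
    # matching what classifier_process unpacks above.
    jobs_queue = Queue()
    output_queue = Queue()

    workers = [Process(target=classifier_process,
                       args=(i, jobs_queue, output_queue, args))
               for i in range(n_workers)]
    for w in workers:
        w.start()

    for nblock, filein_name in enumerate(block_files):
        jobs_queue.put((nblock, filein_name))

    # One None sentinel per worker triggers the "Exiting worker" branch.
    for _ in workers:
        jobs_queue.put(None)

    # Collect one (nblock, fileout_name) result per job before joining,
    # so workers never block on a full output queue.
    results = [output_queue.get() for _ in block_files]
    for w in workers:
        w.join()
    return sorted(results)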
Code example #2
File: bicleaner-train.py  Project: sortiz/bicleaner
def worker_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as tokl, \
         MosesTokenizer(args.target_lang) as tokr:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {}".format(job.__repr__()))
                nblock, filein_name, label = job

                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False) as fileout:
                    logging.debug("Filtering: creating temporary file {}".format(fileout.name))
                    for i in filein:
                        features = feature_extract(i, tokl, tokr, args)
                        
                        for j in features:
                            fileout.write("{}".format(j))
                            fileout.write("\t")
                        fileout.write("{}".format(label))
                        fileout.write("\n")
                    ojob = (nblock, fileout.name)
                    fileout.close()
                    filein.close()
                    output_queue.put(ojob)
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
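Side note (an assumption added here, not code from the project): each row written above is the tab-separated feature values followed by a 0/1 label in the last column, so a later stage could load it with something like the sketch below; the filename features.tsv is hypothetical.

import numpy as np

# Each line: f1<TAB>f2<TAB>...<TAB>fn<TAB>label
data = np.loadtxt("features.tsv", delimiter="\t")
X, y = data[:, :-1], data[:, -1].astype(int)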
Code example #3
def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(
            args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                with open(filein_name, 'r') as filein, NamedTemporaryFile(
                        mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug(
                        "Classification: creating temporary filename {0}".
                        format(fileout.name))
                    feats = []
                    temp_lines = []
                    # TODO Test times with predict one-by-one and this impl
                    for i in filein:
                        parts = i.strip().split("\t")
                        line = ""
                        temp_lines.append(i)
                        if len(parts) == 7:
                            # Last two columns are the language pair
                            if parts[-2] == args.source_lang and parts[-1] == args.target_lang:
                                line = "{}\t{}\n".format(parts[1], parts[3])
                            elif parts[-1] == args.source_lang and parts[-2] == args.target_lang:
                                # reversed language pair: swap the columns
                                line = "{}\t{}\n".format(parts[3], parts[1])
                            features = feature_extract(line, source_tokenizer,
                                                       target_tokenizer, args)
                            feats.append([float(v) for v in features])
                        else:
                            logging.debug(
                                "Line not included in process: {}".format(i))

                    if len(feats) > 0:
                        prediction = args.clf.predict_proba(np.array(feats))

                        row = 0
                        for pred in prediction:
                            while not temp_lines[row].startswith("<tu "):
                                fileout.write(temp_lines[row])
                                row += 1
                            fileout.write("{}\t{}\n".format(
                                temp_lines[row].strip("\n"), str(pred[1])))
                            row += 1
                    else:
                        for l in temp_lines:
                            fileout.write(l)

                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                    output_queue.put(ojob)

                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
Code example #4
File: classify.py  Project: MrTomWhite/cx-miner
def ev_feature_and_label(entry):
    label = entry['label']
    document_words = set(split_words(entry['fulltext']))
    features_extracted = features.feature_extract(entry)
    for word in word_features:
        features_extracted['contains(%s)' % word] = word in document_words

    return (features_extracted, label)
Code example #5
File: classify.py  Project: MrTomWhite/cx-miner
def feature_and_label(entry):
    label = entry['label']
    document_words = set(split_words(entry['fulltext']))
    features_extracted = features.feature_extract(entry)
    for word in word_features:
        features_extracted['contains(%s)' % word] = word in document_words

    entries_by_label[label].append(features_extracted)
    return (label, features_extracted)
Code example #6
def classifier_process(i, jobs_queue, output_queue, args):
    with MosesTokenizer(args.source_lang) as source_tokenizer, MosesTokenizer(args.target_lang) as target_tokenizer:
        while True:
            job = jobs_queue.get()
            if job:
                logging.debug("Job {0}".format(job.__repr__()))
                nblock, filein_name = job
                ojob = None
                with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                    logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                    feats = []

                    for i in filein:
                        parts = i.split("\t")
                        if len(parts) >= 2 and len(parts[0].strip()) != 0 and len(parts[1].strip()) != 0:
                            features = feature_extract(i, source_tokenizer, target_tokenizer, args)
                            # print("SENTENCE PAIR: %%{}%%".format(i))
                            # print(Features(features)) # debug
                            feats.append([float(v) for v in features])

                    predictions = args.clf.predict_proba(np.array(feats)) if len(feats) > 0 else []
                    filein.seek(0)

                    piter = iter(predictions)
                    for i in filein:
                        parts = i.split("\t")
                        if len(parts) >= 2 and len(parts[0].strip()) != 0 and len(parts[1].strip()) != 0:
                            p = next(piter)
                            fileout.write(i.strip())
                            fileout.write("\t")
                            fileout.write(str(p[1]))
                            fileout.write("\n")
                        else:
                            fileout.write(i.strip("\n"))
                            fileout.write("\t0\n")

                    ojob = (nblock, fileout.name)
                    filein.close()
                    fileout.close()
                 
                if ojob:                    
                    output_queue.put(ojob)
                    
                os.unlink(filein_name)
            else:
                logging.debug("Exiting worker")
                break
Code example #7
def worker_process(i, jobs_queue, output_queue, args):
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)
    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {}".format(job.__repr__()))
            nblock, filein_name, label = job

            with open(filein_name, 'r') as filein, NamedTemporaryFile(
                    mode="w", delete=False) as fileout:
                logging.debug("Filtering: creating temporary file {}".format(
                    fileout.name))
                for i in filein:
                    srcsen, trgsen = i.split("\t")[:2]
                    trgsen = trgsen.strip()
                    #                    print(str(srcsen) + " --- " + str(trgsen))
                    features = feature_extract(srcsen, trgsen,
                                               source_tokeniser,
                                               target_tokeniser, args)

                    for j in features:
                        fileout.write("{}".format(j))
                        fileout.write("\t")
                    fileout.write("{}".format(label))
                    fileout.write("\n")
                ojob = (nblock, fileout.name)
                fileout.close()
                filein.close()
                output_queue.put(ojob)
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            source_tokeniser.close()
            target_tokeniser.close()
            break
Code example #8
def classify(args):
    global nline
    batch_size = 10000
    buf_sent = []
    buf_feat = []
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)
    for i in args.input:
        nline += 1
        parts = i.split("\t")
        
        sl_sentence=None
        tl_sentence=None
        if len(parts) >= max(args.scol, args.tcol):
            sl_sentence=parts[args.scol -1]
            tl_sentence=parts[args.tcol -1]
        else:
            logging.error("ERROR: scol ({}) or tcol ({}) indexes above column number ({}) on line {}".format(args.scol, args.tcol, len(parts), nline))
                       
        if sl_sentence and tl_sentence and len(sl_sentence.strip()) != 0 and len(tl_sentence.strip()) != 0 and (args.disable_hardrules or wrong_tu(sl_sentence.strip(),tl_sentence.strip(), args)== False):
            lmScore=None
            if args.lm_filter:
                lmScore=args.lm_filter.score(sl_sentence,tl_sentence)
            if lmScore != None and lmScore < args.lm_threshold and not args.keep_lm_result:
                buf_sent.append((0, i,lmScore))
            else:
                buf_sent.append((1, i,lmScore))
                features = feature_extract(sl_sentence, tl_sentence, source_tokeniser, target_tokeniser, args)
                buf_feat.append([float(v) for v in features])
        else:
            lmScore=None
            if args.lm_filter:
                lmScore=0
            buf_sent.append((0, i, lmScore))
        
        if (nline % batch_size) == 0:
            args.clf.set_params(n_jobs = 1)
            predictions = args.clf.predict_proba(np.array(buf_feat)) if len(buf_feat) > 0 else []
            p = iter(predictions)
                
            for k, l, lmScore in buf_sent:
                if k == 1:
                    if args.score_only:
                        args.output.write("{0:.3f}".format((next(p)[1])))
                    else:
                        args.output.write(l.strip())
                        args.output.write("\t")
                        args.output.write("{0:.3f}".format((next(p)[1])))
                        if lmScore != None and args.keep_lm_result:
                            args.output.write("\t")
                            args.output.write("{0:.3f}".format(lmScore))
                    args.output.write("\n")
                else:
                    if args.score_only:
                        args.output.write("0")
                    else:    
                        args.output.write(l.strip("\n"))
                        args.output.write("\t0")
                        if lmScore != None and args.keep_lm_result:
                            args.output.write("\t0")
                    args.output.write("\n")

            buf_feat = []
            buf_sent = []

    if len(buf_sent) > 0:
        predictions = args.clf.predict_proba(np.array(buf_feat)) if len(buf_feat) > 0 else []
        p = iter(predictions)
            
        for k, l, lmScore in buf_sent:
            if k == 1:
                if args.score_only:
                    args.output.write("{0:.3f}".format((next(p)[1])))
                else:
                    args.output.write(l.strip())
                    args.output.write("\t")
                    args.output.write("{0:.3f}".format((next(p)[1])))
                    if lmScore != None and args.keep_lm_result:
                        args.output.write("\t")
                        args.output.write("{0:.3f}".format(lmScore))
                args.output.write("\n")
            else:
                if args.score_only:
                    args.output.write("0")
                else:    
                    args.output.write(l.strip("\n"))
                    args.output.write("\t0")
                    if lmScore != None and args.keep_lm_result:
                        args.output.write("\t0")
                args.output.write("\n")
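A note on the score column (illustration added here, not bicleaner code): scikit-learn's predict_proba returns one row of class probabilities per sample, ordered by clf.classes_, so with labels {0, 1} the second column is the probability of the "good" class; that is why the examples write pred[1] / next(p)[1]. A tiny self-contained sketch:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Toy data: two features per sample, label 1 marks the "good" class.
X = np.array([[0.1, 0.2], [0.9, 0.8], [0.2, 0.1], [0.8, 0.9]])
y = np.array([0, 1, 0, 1])

clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
probs = clf.predict_proba(np.array([[0.85, 0.9], [0.15, 0.1]]))
for pred in probs:
    print("{0:.3f}".format(pred[1]))  # the value written to the output column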
Code example #9
def perform_training(args):
    global nline
    time_start = default_timer()
    logging.info("Starting process")

    # Read input to a named temporary file
    # We may need to read it multiple times and that would be problematic if it is sys.stdin
    input = NamedTemporaryFile(mode="w", delete=False)
    for line in args.input:
        input.write(line)
    input.close()

    stats = None
    with open(input.name) as input_f:
        args.input = input_f
        stats = train_fluency_filter(args)
        args.input.seek(0)

        # Shuffle and get length ratio
        total_size, length_ratio, good_sentences, wrong_sentences = shuffle(
            args.input, args.good_examples + args.good_test_examples,
            args.wrong_examples + args.wrong_test_examples,
            args.wrong_examples_file)
    os.remove(input.name)

    args.length_ratio = length_ratio

    # Load dictionaries
    args.dict_sl_tl = ProbabilisticDictionary(args.source_dictionary)
    args.dict_tl_sl = ProbabilisticDictionary(args.target_dictionary)

    features_file = NamedTemporaryFile(delete=False)
    if args.source_tokeniser_path:
        tokl = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        tokl = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        tokr = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        tokr = MosesTokenizer(args.target_lang)
    with open(good_sentences.name, 'r') as gsf, \
            open(wrong_sentences.name, 'r') as wsf, \
            open(features_file.name, 'w+') as fileout:

        for i in gsf:
            srcsen, trgsen = i.split("\t")[:2]
            #            print(str(i) + " ---" + str(srcsen) + " --- " + str(trgsen))
            features = feature_extract(srcsen, trgsen, tokl, tokr, args)
            for j in features:
                fileout.write("{}".format(j))
                fileout.write("\t")
            fileout.write("{}".format(1))
            fileout.write("\n")
        fileout.flush()

        for i in wsf:
            srcsen, trgsen = i.split("\t")[:2]
            #            print(str(i) + " ---" + str(srcsen) + " --- " + str(trgsen))
            features = feature_extract(srcsen, trgsen, tokl, tokr, args)
            for j in features:
                fileout.write("{}".format(j))
                fileout.write("\t")
            fileout.write("{}".format(0))
            fileout.write("\n")
        fileout.flush()
    tokl.close()
    tokr.close()

    features_file.seek(0)

    if args.dump_features:
        logging.info("Dumping features to " +
                     os.path.abspath(args.dump_features.name))
        for i in features_file:
            args.dump_features.write(i)
        args.dump_features.close()
        features_file.seek(0)

    logging.info("Start training")
    features_file.close()

    hgood = []
    hwrong = []
    with TemporaryFile("w+") as features_train, TemporaryFile(
            "w+") as features_test, open(features_file.name, 'r') as ff:
        nline = 0
        for line in ff:
            #            print(line)
            if nline < args.good_examples:
                features_train.write(line)
            elif nline < args.good_examples + args.good_test_examples:
                features_test.write(line)
            elif nline < args.good_examples + args.good_test_examples + args.wrong_examples:
                features_train.write(line)
            else:
                features_test.write(line)
            nline += 1

        features_train.flush()
        features_test.flush()

        features_train.seek(0)
        features_test.seek(0)
        hgood, hwrong = train_classifier(features_train, features_test,
                                         args.classifier_type, args.classifier)
        features_train.close()
        features_test.close()

    logging.info("End training")

    write_metadata(args, length_ratio, hgood, hwrong, stats)
    args.metadata.close()

    # Stats
    logging.info("Finished")
    elapsed_time = default_timer() - time_start
    logging.info("Elapsed time {:.2f} s".format(elapsed_time))
Code example #10
def classifier_process(i, jobs_queue, output_queue, args):

    source_tokenizer = Tokenizer(args.source_tokenizer_command,
                                 args.source_lang)
    target_tokenizer = Tokenizer(args.target_tokenizer_command,
                                 args.target_lang)

    if not args.disable_lm_filter:
        lm_filter = load_lm_filter(args.source_lang, args.target_lang,
                                   args.metadata_yaml,
                                   args.source_tokenizer_command,
                                   args.target_tokenizer_command)
    else:
        lm_filter = None

    if not args.disable_porn_removal:
        porn_removal = args.porn_removal
        if args.metadata_yaml['porn_removal_side'] == 'tl':
            porn_tokenizer = Tokenizer(args.target_tokenizer_command,
                                       args.target_lang)
        else:
            porn_tokenizer = Tokenizer(args.source_tokenizer_command,
                                       args.source_lang)
    else:
        porn_removal = None
        porn_tokenizer = None

    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {0}".format(job.__repr__()))
            nblock, filein_name = job
            ojob = None
            with open(filein_name, 'r') as filein, NamedTemporaryFile(
                    mode="w", delete=False, dir=args.tmp_dir) as fileout:
                logging.debug(
                    "Classification: creating temporary filename {0}".format(
                        fileout.name))
                feats = []
                lm_scores = []

                #Create the following arrays:
                #valid_sentences: boolean, length of input. States whether each sentence passed
                #  hard rules and lm fluency filtering
                #feats: vector of tuples, input features to the classifier, length equals number
                #  of sentences in the input that passed hard rules + lm fluency filtering

                valid_sentences = []
                for i in filein:
                    parts = i.split("\t")
                    sl_sentence = None
                    tl_sentence = None
                    if len(parts) >= max(args.scol, args.tcol):
                        sl_sentence = parts[args.scol - 1]
                        tl_sentence = parts[args.tcol - 1]
                    else:
                        logging.error(
                            "ERROR: scol ({}) or tcol ({}) indexes above column number ({})"
                            .format(args.scol, args.tcol, len(parts)))

                    if sl_sentence and tl_sentence and len(sl_sentence.strip(
                    )) != 0 and len(tl_sentence.strip()) != 0 and (
                            args.disable_hardrules or wrong_tu(
                                sl_sentence.strip(), tl_sentence.strip(), args,
                                lm_filter, porn_removal, porn_tokenizer)
                            == False):
                        #if disable_hardrules == 1 --> the second part (and) is always true
                        features = feature_extract(sl_sentence, tl_sentence,
                                                   source_tokenizer,
                                                   target_tokenizer, args)

                        feats.append([float(v) for v in features])
                        valid_sentences.append(True)
                    else:
                        valid_sentences.append(False)

                predictions = args.clf.predict_proba(
                    np.array(feats)) if len(feats) > 0 else []
                filein.seek(0)

                piter = iter(predictions)

                for i, valid_sentence in zip(filein, valid_sentences):
                    if valid_sentence:
                        p = next(piter)
                        if args.score_only:
                            fileout.write("{0:.3f}".format(p[1]))
                        else:
                            fileout.write(i.strip())
                            fileout.write("\t{0:.3f}".format(p[1]))

                        fileout.write("\n")
                    else:
                        if args.score_only:
                            fileout.write("0")
                        else:
                            fileout.write(i.strip("\n"))
                            fileout.write("\t0")
                        fileout.write("\n")

                ojob = (nblock, fileout.name)
                filein.close()
                fileout.close()

            if ojob:
                output_queue.put(ojob)

            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            break
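The valid_sentences bookkeeping above can be shown in isolation. The toy sketch below (not project code) scores only lines with at least two tab-separated fields and advances the prediction iterator only where the mask is True, so output stays aligned with input; the predictions array is a stand-in for args.clf.predict_proba.

import numpy as np

lines = ["good\tbueno", "malformed line", "ok\tvale"]
valid_sentences, feats = [], []
for line in lines:
    parts = line.split("\t")
    if len(parts) >= 2:
        feats.append([float(len(parts[0])), float(len(parts[1]))])
        valid_sentences.append(True)
    else:
        valid_sentences.append(False)

# Stand-in classifier output: one (p0, p1) row per valid line.
predictions = np.tile([0.3, 0.7], (len(feats), 1))
piter = iter(predictions)
for line, valid in zip(lines, valid_sentences):
    score = next(piter)[1] if valid else 0.0
    print("{}\t{:.3f}".format(line, score))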
Code example #11
def classify(args):
    global nline
    batch_size = 10000
    buf_sent = []
    buf_feat = []
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)
    for i in args.input:
        nline += 1
        parts = i.split("\t")

        sl_sentence=None
        tl_sentence=None
        if len(parts) >= 4:
            sl_sentence=parts[2]
            tl_sentence=parts[3]

        if len(parts) == 2:
            sl_sentence=parts[0]
            tl_sentence=parts[1]

        if sl_sentence and tl_sentence and len(sl_sentence.strip()) != 0 and len(tl_sentence.strip()) != 0 and wrong_tu(sl_sentence.strip(),tl_sentence.strip(), args)== False:
            lmScore=None
            if args.lm_filter:
                lmScore=args.lm_filter.score(sl_sentence,tl_sentence)
            if lmScore != None and lmScore < args.lm_threshold and not args.keep_lm_result:
                buf_sent.append((0, i,lmScore))
            else:
                buf_sent.append((1, i,lmScore))
                features = feature_extract(sl_sentence, tl_sentence, source_tokeniser, target_tokeniser, args)
                buf_feat.append([float(v) for v in features])
        else:
            lmScore=None
            if args.lm_filter:
                lmScore=0
            buf_sent.append((0, i, lmScore))

        if (nline % batch_size) == 0:
            args.clf.set_params(n_jobs = 1)
            predictions = args.clf.predict_proba(np.array(buf_feat)) if len(buf_feat) > 0 else []
            p = iter(predictions)

            for k, l, lmScore in buf_sent:
                if k == 1:

                    if args.score_only:
                        args.output.write(str(next(p)[1]))
                    else:

                        args.output.write(l.strip())
                        args.output.write("\t")
                        args.output.write(str(next(p)[1]))
                        if lmScore != None and args.keep_lm_result:
                            args.output.write("\t")
                            args.output.write(str(lmScore))
                    args.output.write("\n")
                else:

                    if args.score_only:
                        args.output.write("0")

                    else:
                        args.output.write(l.strip("\n"))
                        args.output.write("\t0")
                        if lmScore != None and args.keep_lm_result:
                            args.output.write("\t0")
                    args.output.write("\n")

            buf_feat = []
            buf_sent = []

    if len(buf_sent) > 0:
        predictions = args.clf.predict_proba(np.array(buf_feat)) if len(buf_feat) > 0 else []
        p = iter(predictions)

        for k, l, lmScore in buf_sent:
            if k == 1:

                if args.score_only:
                    args.output.write(str(next(p)[1]))
                else:

                    args.output.write(l.strip())
                    args.output.write("\t")
                    args.output.write(str(next(p)[1]))
                    if lmScore != None and args.keep_lm_result:
                        args.output.write("\t")
                        args.output.write(str(lmScore))
                args.output.write("\n")
            else:

                if args.score_only:
                    args.output.write("0")

                else:
                    args.output.write(l.strip("\n"))
                    args.output.write("\t0")
                    if lmScore != None and args.keep_lm_result:
                        args.output.write("\t0")
                args.output.write("\n")
Code example #12
def classify(args):
    global nline
    batch_size = 10000
    buf_sent = []
    buf_feat = []

    source_tokenizer = Tokenizer(args.source_tokenizer_command,
                                 args.source_lang)
    target_tokenizer = Tokenizer(args.target_tokenizer_command,
                                 args.target_lang)

    if not args.disable_lm_filter:
        lm_filter = load_lm_filter(args.source_lang, args.target_lang,
                                   args.metadata_yaml,
                                   args.source_tokenizer_command,
                                   args.target_tokenizer_command)
    else:
        lm_filter = None

    if not args.disable_porn_removal:
        porn_removal = args.porn_removal
        if args.metadata_yaml['porn_removal_side'] == 'tl':
            porn_tokenizer = Tokenizer(args.target_tokenizer_command,
                                       args.target_lang)
        else:
            porn_tokenizer = Tokenizer(args.source_tokenizer_command,
                                       args.source_lang)
    else:
        porn_removal = None
        porn_tokenizer = None

    for i in args.input:
        nline += 1
        parts = i.split("\t")

        sl_sentence = None
        tl_sentence = None
        if len(parts) >= max(args.scol, args.tcol):
            sl_sentence = parts[args.scol - 1]
            tl_sentence = parts[args.tcol - 1]
        else:
            logging.error(
                "ERROR: scol ({}) or tcol ({}) indexes above column number ({}) on line {}"
                .format(args.scol, args.tcol, len(parts), nline))

        if sl_sentence and tl_sentence and len(
                sl_sentence.strip()) != 0 and len(
                    tl_sentence.strip()) != 0 and (
                        args.disable_hardrules or wrong_tu(
                            sl_sentence.strip(), tl_sentence.strip(), args,
                            lm_filter, porn_removal, porn_tokenizer) == False):
            buf_sent.append((1, i))
            features = feature_extract(sl_sentence, tl_sentence,
                                       source_tokenizer, target_tokenizer,
                                       args)
            buf_feat.append([float(v) for v in features])
        else:
            buf_sent.append((0, i))

        if (nline % batch_size) == 0:
            args.clf.set_params(n_jobs=1)
            predictions = args.clf.predict_proba(
                np.array(buf_feat)) if len(buf_feat) > 0 else []
            p = iter(predictions)

            for k, l in buf_sent:
                if k == 1:
                    if args.score_only:
                        args.output.write("{0:.3f}".format((next(p)[1])))
                    else:
                        args.output.write(l.strip())
                        args.output.write("\t{0:.3f}".format((next(p)[1])))
                    args.output.write("\n")
                else:
                    if args.score_only:
                        args.output.write("0")
                    else:
                        args.output.write(l.strip("\n"))
                        args.output.write("\t0")
                    args.output.write("\n")

            buf_feat = []
            buf_sent = []

    if len(buf_sent) > 0:
        predictions = args.clf.predict_proba(
            np.array(buf_feat)) if len(buf_feat) > 0 else []
        p = iter(predictions)

        for k, l in buf_sent:
            if k == 1:
                if args.score_only:
                    args.output.write("{0:.3f}".format((next(p)[1])))
                else:
                    args.output.write(l.strip())
                    args.output.write("\t")
                    args.output.write("{0:.3f}".format((next(p)[1])))
                args.output.write("\n")
            else:
                if args.score_only:
                    args.output.write("0")
                else:
                    args.output.write(l.strip("\n"))
                    args.output.write("\t0")
                args.output.write("\n")
Code example #13
def classifier_process(i, jobs_queue, output_queue, args):
    if args.source_tokeniser_path:
        source_tokeniser = ToolWrapper(args.source_tokeniser_path.split(' '))
    else:
        source_tokeniser = MosesTokenizer(args.source_lang)
    if args.target_tokeniser_path:
        target_tokeniser = ToolWrapper(args.target_tokeniser_path.split(' '))
    else:
        target_tokeniser = MosesTokenizer(args.target_lang)
    
    #Load LM for fluency scoring
    lm_filter=None
    if args.source_lm and args.target_lm:
        lm_filter=DualLMFluencyFilter(args.lm_type,args.source_lang, args.target_lang)
        lm_filter.load(args.source_lm, args.target_lm,args.lm_filter_stats)
    
    while True:
        job = jobs_queue.get()
        if job:
            logging.debug("Job {0}".format(job.__repr__()))
            nblock, filein_name = job
            ojob = None
            with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
                logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
                feats = []
                lm_scores=[]
                
                #Create the following arrays:
                #valid_sentences: boolean, length of input. States whether each sentence passed
                #  hard rules and lm fluency filtering
                #feats: vector of tuples, input features to the classifier, length equals number
                #  of sentences in the input that passed hard rules + lm fluency filtering
                
                valid_sentences=[]
                for i in filein:
                    parts = i.split("\t")
                    sl_sentence=None
                    tl_sentence=None
                    if len(parts) >= 4:
                        sl_sentence=parts[2]
                        tl_sentence=parts[3]
                    if sl_sentence and tl_sentence and len(sl_sentence.strip()) != 0 and len(tl_sentence.strip()) != 0 and wrong_tu(sl_sentence.strip(),tl_sentence.strip(), args)== False:
                        lm_score=None
                        if lm_filter:
                            lm_score=lm_filter.score(sl_sentence,tl_sentence)
                        if lm_filter and lm_score < args.lm_threshold and not args.keep_lm_result:
                            valid_sentences.append(False)
                        else:
                            features = feature_extract(sl_sentence, tl_sentence, source_tokeniser, target_tokeniser, args)
                            feats.append([float(v) for v in features])
                            lm_scores.append(lm_score)        
                            valid_sentences.append(True)
                    else:
                        valid_sentences.append(False)
                    

                predictions = args.clf.predict_proba(np.array(feats)) if len(feats) > 0 else []
                filein.seek(0)

                piter = iter(predictions)
                if lm_filter:
                    lmiter=iter(lm_scores)
                for i, valid_sentence in zip(filein,valid_sentences):                    
                    if valid_sentence:
                        p = next(piter)
                        
                        fileout.write(i.strip())
                        fileout.write("\t")
                        fileout.write(str(p[1]))
                        if lm_filter and args.keep_lm_result:
                            lm_score=next(lmiter)
                            fileout.write("\t")
                            fileout.write(str(lm_score))
                        fileout.write("\n")
                    else:
                        fileout.write(i.strip("\n"))
                        fileout.write("\t0")
                        if lm_filter and args.keep_lm_result:
                            fileout.write("\t0")
                        fileout.write("\n")

                ojob = (nblock, fileout.name)
                filein.close()
                fileout.close()
             
            if ojob:                    
                output_queue.put(ojob)
                
            os.unlink(filein_name)
        else:
            logging.debug("Exiting worker")
            break
Code example #14
import sys
import random
from features import feature_extract

if __name__ == '__main__':
    tot = len(sys.argv[1:])
    nrows = 750
    song_list = sys.argv[1:]
    random.shuffle(song_list)
    feature_extract(song_list[0:nrows])