from domainextractor import DomainExtractor
from runconfiguration import *
from ngramperplexity import NGramPerplexity
import sys

general_corpora_dir = "corpora/big"
specific_corpora_dir = "corpora/small"
output_dir = "corpora/output/monolingual"

NGramPerplexity.ngram_size = 2
RUN_CONFIGURATION.mode = MODE.STATISTICS
if len(sys.argv) > 1:
    RUN_CONFIGURATION.workers = int(sys.argv[1]) # Number of workers
    RUN_CONFIGURATION.worker_id = int(sys.argv[2]) # Current worker id, should be >= 1 and <= number of workers

threshold_tfidf = 0.963 # range [0.0,1.0]: lower value = more similar data extracted
threshold_perplexity_ngram = 458 # range [0.0,infinite]: lower value = more similar data extracted
threshold_edit_distance = 0.80  # range [0.0,1.0]: lower value = more similar data extracted

domain_extractor = DomainExtractor(general_corpora_dir, specific_corpora_dir, output_dir)
domain_extractor.run_mono(threshold_tfidf, threshold_perplexity_ngram, threshold_edit_distance)
Example #2
0
from domainextractor import DomainExtractor
from enums import SIMILARITY_ACCEPT_CRITERIA
from ngramperplexity import NGramPerplexity

general_corpora_dir = "corpora/big"
specific_corpora_dir = "corpora/small"
output_file_en = "corpora/output/output.en"
output_file_pl = "corpora/output/output.pl"

NGramPerplexity.ngram_size = 2
sim_accept_criteria = SIMILARITY_ACCEPT_CRITERIA.BOTH_LANGUAGES

threshold_en_tfidf = 0.963 # range [0.0,1.0]: lower value = more similar data extracted
threshold_en_perplexity_ngram = 458 # range [0.0,infinite]: lower value = more similar data extracted
threshold_en_edit_distance = 0.86  # range [0.0,1.0]: lower value = more similar data extracted
threshold_pl_tfidf = 0.994 # range [0.0,1.0]: lower value = more similar data extracted
threshold_pl_perplexity_ngram = 465.404 # range [0.0,infinite]: lower value = more similar data extracted
threshold_pl_edit_distance = 0.90  # range [0.0,1.0]: lower value = more similar data extracted




domain_extractor = DomainExtractor(general_corpora_dir, specific_corpora_dir, output_file_en, output_file_pl)
domain_extractor.run(threshold_en_tfidf, threshold_en_perplexity_ngram, threshold_en_edit_distance,
                     threshold_pl_tfidf, threshold_pl_perplexity_ngram, threshold_pl_edit_distance,
                     sim_accept_criteria)
from runconfiguration import *
from ngramperplexity import NGramPerplexity
import sys

general_corpora_dir = "corpora/big"
specific_corpora_dir = "corpora/small"
output_dir = "corpora/output/bilingual"

NGramPerplexity.ngram_size = 2
RUN_CONFIGURATION.mode = MODE.STATISTICS
RUN_CONFIGURATION.accept_criteria = ACCEPT_CRITERIA.BILINGUAL_BOTH
if len(sys.argv) > 1:
    RUN_CONFIGURATION.workers = int(sys.argv[1])  # Number of workers
    RUN_CONFIGURATION.worker_id = int(
        sys.argv[2]
    )  # Current worker id, should be >= 1 and <= number of workers

threshold_en_tfidf = 0.963  # range [0.0,1.0]: lower value = more similar data extracted
threshold_en_perplexity_ngram = 458  # range [0.0,infinite]: lower value = more similar data extracted
threshold_en_edit_distance = 0.86  # range [0.0,1.0]: lower value = more similar data extracted
threshold_pl_tfidf = 0.994  # range [0.0,1.0]: lower value = more similar data extracted
threshold_pl_perplexity_ngram = 465.404  # range [0.0,infinite]: lower value = more similar data extracted
threshold_pl_edit_distance = 0.90  # range [0.0,1.0]: lower value = more similar data extracted

domain_extractor = DomainExtractor(general_corpora_dir, specific_corpora_dir,
                                   output_dir)
domain_extractor.run_bilingual(threshold_en_tfidf,
                               threshold_en_perplexity_ngram,
                               threshold_en_edit_distance, threshold_pl_tfidf,
                               threshold_pl_perplexity_ngram,
                               threshold_pl_edit_distance)
from domainextractor import DomainExtractor
from runconfiguration import *
from ngramperplexity import NGramPerplexity
import sys

general_corpora_dir = "corpora/big"
specific_corpora_dir = "corpora/small"
output_dir = "corpora/output/bilingual"

NGramPerplexity.ngram_size = 2
RUN_CONFIGURATION.mode = MODE.STATISTICS
RUN_CONFIGURATION.accept_criteria = ACCEPT_CRITERIA.BILINGUAL_BOTH
if len(sys.argv) > 1:
    RUN_CONFIGURATION.workers = int(sys.argv[1]) # Number of workers
    RUN_CONFIGURATION.worker_id = int(sys.argv[2]) # Current worker id, should be >= 1 and <= number of workers

threshold_en_tfidf = 0.963 # range [0.0,1.0]: lower value = more similar data extracted
threshold_en_perplexity_ngram = 458 # range [0.0,infinite]: lower value = more similar data extracted
threshold_en_edit_distance = 0.86  # range [0.0,1.0]: lower value = more similar data extracted
threshold_pl_tfidf = 0.994 # range [0.0,1.0]: lower value = more similar data extracted
threshold_pl_perplexity_ngram = 465.404 # range [0.0,infinite]: lower value = more similar data extracted
threshold_pl_edit_distance = 0.90  # range [0.0,1.0]: lower value = more similar data extracted

domain_extractor = DomainExtractor(general_corpora_dir, specific_corpora_dir, output_dir)
domain_extractor.run_bilingual(threshold_en_tfidf, threshold_en_perplexity_ngram, threshold_en_edit_distance,
                     threshold_pl_tfidf, threshold_pl_perplexity_ngram, threshold_pl_edit_distance)
from domainextractor import DomainExtractor
from runconfiguration import *
from ngramperplexity import NGramPerplexity
import sys

general_corpora_dir = "corpora/big"
specific_corpora_dir = "corpora/small"
output_dir = "corpora/output/monolingual"

NGramPerplexity.ngram_size = 2
RUN_CONFIGURATION.mode = MODE.STATISTICS
if len(sys.argv) > 1:
    RUN_CONFIGURATION.workers = int(sys.argv[1])  # Number of workers
    RUN_CONFIGURATION.worker_id = int(
        sys.argv[2]
    )  # Current worker id, should be >= 1 and <= number of workers

threshold_tfidf = 0.963  # range [0.0,1.0]: lower value = more similar data extracted
threshold_perplexity_ngram = 458  # range [0.0,infinite]: lower value = more similar data extracted
threshold_edit_distance = 0.80  # range [0.0,1.0]: lower value = more similar data extracted

domain_extractor = DomainExtractor(general_corpora_dir, specific_corpora_dir,
                                   output_dir)
domain_extractor.run_mono(threshold_tfidf, threshold_perplexity_ngram,
                          threshold_edit_distance)