def load_topic_tokens(file_path,
                      encoding='latin-1',
                      strip_html_tags=True,
                      strip_html_entities=True,
                      strip_square_bracket_tags=True,
                      preprocess=create_preprocessor()):
    """Generator which provides a list of topics along with the
    preprocessed and tokenized fields
    """
    result = []
    topics = __regex_parse_topics_from_file(file_path)

    for topic in topics:
        processed = {}

        for field in ['title', 'narr', 'desc']:
            content = topic[field]
            words = split_words(
                content,
                strip_html_tags=strip_html_tags,
                strip_html_entities=strip_html_entities,
                strip_square_bracket_tags=strip_square_bracket_tags)

            terms = preprocess(words)
            processed[field] = set(terms)

        result.append(
            Topic(topic['id'], processed['title'], processed['narr'],
                  processed['desc']))

    return result
Example #2
    def run_eval(ranking_method, params=None):
        params = params or {}
        preprocessor = create_preprocessor(
            enable_case_folding=enable_case_folding,
            enable_remove_stop_words=enable_remove_stop_words,
            enable_stemmer=enable_stemmer,
            enable_lemmatizer=enable_lemmatizer,
            min_length=min_word_length)

        click.echo(f'Loading topics from {topics_file}')
        topics = load_topic_tokens(
            topics_file,
            preprocess=preprocessor,
            strip_html_tags=enable_strip_html_tags,
            strip_html_entities=enable_strip_html_entities,
            strip_square_bracket_tags=enable_strip_square_bracket_tags)
        click.echo('done')

        click.echo(f'Loading document stats from {stats_file}')
        document_stats = load_document_stats(stats_file)
        click.echo('done')

        click.echo(f'Loading search index from {index_file}')
        click.echo('This might take a while')
        start = time.time()
        number_of_documents, index_reader_generator = create_index_reader(
            index_file)
        index_reader = index_reader_generator()
        index = list(index_reader)
        click.echo(f'done in {time.time() - start} seconds')

        generate_qrel(number_of_documents, index, document_stats, topics,
                      output_file, ranking_method, run_name, params)
Example #3
def use_model(dataset, params, saved_model_filename):
    feature_fields = deepcopy(params['features'])
    feature_fields.append(params['index'])

    real_features, y = get_features_and_labels(dataset, feature_fields)

    preprocess_data_func = preprocess_data_function()

    try:
        # compile and execute the user-supplied preprocessing code, which
        # is expected to define a `preprocess_data` function
        user_code = compile(preprocess_data_func, '<user_code>', 'exec')
        exec(user_code, globals())
        real_features = preprocess_data(real_features)
    except Exception:
        print("System cannot run user code")

    real_X = real_features[params['features']]

    # create preprocessor for pipeline
    preprocessor = create_preprocessor(real_X)

    # load model
    with open(saved_model_filename, 'rb') as model_file:
        pipeline = pickle.load(model_file)

    predictions = pipeline.predict(real_X)

    output = pd.DataFrame({
        params['index']: real_features[params['index']],
        params['label']: predictions
    })

    return output
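A minimal usage sketch for use_model. The CSV path, the column names and the model filename below are assumptions made for illustration; only the shape of the params dict (the 'features', 'index' and 'label' keys) is taken from the function above.

import pandas as pd

# hypothetical input data and parameters, not part of the original example
dataset = pd.read_csv('new_data.csv')
params = {
    'features': ['age', 'income'],   # assumed feature columns
    'index': 'customer_id',          # assumed identifier column
    'label': 'churn'                 # assumed name for the prediction column
}

predictions = use_model(dataset, params, 'saved_model.pkl')
print(predictions.head())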
Example #4
def cli(ctx, document_folder, index_file, stats_file,
        enable_case_folding, enable_stemmer, enable_lemmatizer,
        enable_remove_stop_words, min_word_length,
        enable_strip_html_tags, enable_strip_html_entities,
        enable_strip_square_bracket_tags):
    nltk.download('wordnet')

    preprocessor = create_preprocessor(enable_case_folding=enable_case_folding,
                                       enable_remove_stop_words=enable_remove_stop_words,
                                       enable_stemmer=enable_stemmer,
                                       enable_lemmatizer=enable_lemmatizer,
                                       min_length=min_word_length)

    glob_pattern = document_folder + '/**'
    document_files = [fname for fname in glob.glob(glob_pattern, recursive=True) if os.path.isfile(fname)]

    click.echo()
    click.echo('Processing {} file(s)'.format(len(document_files)))
    click.echo()

    ctx.obj['INDEX_FILE'] = index_file
    ctx.obj['STATS_FILE'] = stats_file
    ctx.obj['DOCUMENT_FILES'] = document_files
    ctx.obj['PREPROCESSOR'] = preprocessor

    ctx.obj['STRIP_HTML_TAGS'] = enable_strip_html_tags
    ctx.obj['STRIP_HTML_ENTITIES'] = enable_strip_html_entities
    ctx.obj['STRIP_SQUARE_BRACKET_TAGS'] = enable_strip_square_bracket_tags
Example #5
def generate_tokens_for_files(filepaths,
                              encoding='latin-1',
                              use_regex_parser=True,
                              strip_html_tags=True,
                              strip_html_entities=True,
                              strip_square_bracket_tags=True,
                              preprocess=create_preprocessor()):
    """Generator which provides a list of (doc_id, term) pairs for documents
    contained in the given files
    """

    num_documents_processed = 0
    for filepath in tqdm(filepaths, total=len(filepaths)):
        documents = None

        if use_regex_parser:
            documents = __regex_parse_documents_from_file(filepath)
        else:
            documents = __xml_parse_documents_from_file(filepath)

        for document in documents:
            num_documents_processed += 1

            (doc_id, content) = document

            words = split_words(
                content,
                strip_html_tags=strip_html_tags,
                strip_html_entities=strip_html_entities,
                strip_square_bracket_tags=strip_square_bracket_tags)

            terms = preprocess(words)

            for term in terms:
                yield (doc_id, term, num_documents_processed)
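A short consumption sketch for the generator above; the glob pattern and the Counter-based term counting are illustrative assumptions, not part of the original indexing pipeline.

import glob
import os
from collections import Counter

# hypothetical location of the document collection
document_files = [f for f in glob.glob('./data/documents/**', recursive=True)
                  if os.path.isfile(f)]

# count how often each (doc_id, term) pair occurs; the running document
# count in each yielded triple is ignored here
term_frequencies = Counter()
for doc_id, term, _ in generate_tokens_for_files(document_files):
    term_frequencies[(doc_id, term)] += 1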
Example #6
    def run_eval(ranking_method, params=None):
        params = params or {}
        preprocess = create_preprocessor(
            enable_case_folding=enable_case_folding,
            enable_remove_stop_words=enable_remove_stop_words,
            enable_stemmer=enable_stemmer,
            enable_lemmatizer=enable_lemmatizer,
            min_length=min_word_length)

        words = split_words(
            query,
            strip_html_tags=enable_strip_html_tags,
            strip_html_entities=enable_strip_html_entities,
            strip_square_bracket_tags=enable_strip_square_bracket_tags)

        search_terms = preprocess(words)

        click.echo(f'Searching for "{query}" using "{ranking_method}"')
        click.echo(f'Words: "{words}"')
        click.echo(f'Terms: "{search_terms}"')

        click.echo(f'Loading document stats from {stats_file}')
        document_stats = load_document_stats(stats_file)
        click.echo('done')

        click.echo(f'Loading search index from {index_file}')
        click.echo('This might take a while')
        start = time.time()
        number_of_documents, index_reader_generator = create_index_reader(
            index_file)
        index_reader = index_reader_generator()
        index = list(index_reader)
        click.echo(f'done in {time.time() - start} seconds')

        document_scores = None

        if ranking_method == 'tfidf':
            document_scores = simple_tfidf_search(number_of_documents, index,
                                                  search_terms)
        elif ranking_method == 'cosine_tfidf':
            document_scores = cosine_tfidf_search(number_of_documents, index,
                                                  search_terms)
        elif ranking_method == 'bm25':
            document_scores = simple_bm25_search(number_of_documents,
                                                 index,
                                                 search_terms,
                                                 document_stats,
                                                 k1=params['k1'],
                                                 b=params['b'],
                                                 k3=params['k3'])
        elif ranking_method == 'bm25va':
            document_scores = simple_bm25va_search(number_of_documents,
                                                   index,
                                                   search_terms,
                                                   document_stats,
                                                   k1=params['k1'],
                                                   k3=params['k3'])

        for document_score in document_scores[:50]:
            print(f'{document_score[1]}\t{document_score[0]}')
Example #7
def add_preprocessor():
    """Handle a request for adding a new preprocessor.

    Returns:
        The string ID of the newly created preprocessor.
    """

    json_data = _get_json_from_request()

    error_log = validate_cfg(json_data, STATE, prepro=True)
    if error_log != {}:
        return json.dumps({'log': error_log})

    prepro = create_preprocessor(json_data)
    idx = STATE.add_preprocessor(prepro)
    return json.dumps({'id': idx})
Example #8
def train_model(dataset,
                fields,
                saved_model_filename,
                algorithm='decision_tree',
                task_type='classification'):
    data = pd.read_csv(dataset)

    # get features and labels
    features, y_train = get_features_and_labels(data,
                                                fields['features'],
                                                fields['label'],
                                                task_type=task_type)

    preprocess_data_func = preprocess_data_function()

    try:
        # compile and execute the user-supplied preprocessing code, which
        # is expected to define a `preprocess_data` function
        user_code = compile(preprocess_data_func, '<user_code>', 'exec')
        exec(user_code, globals())
        features = preprocess_data(features)
    except Exception:
        print("System cannot run user code")

    X_train = features[fields['features']]

    # create preprocessor for pipeline
    preprocessor = create_preprocessor(X_train)

    # train model
    if task_type == 'classification':
        if algorithm == 'decision_tree':
            model = DecisionTreeClassifier(random_state=241)
        elif algorithm == 'logistic_regression':
            model = LogisticRegression()
        elif algorithm == 'knn':
            model = KNeighborsClassifier()
        elif algorithm == 'svm':
            model = svm.SVC(random_state=241)
        elif algorithm == 'naive_bayes':
            model = GaussianNB()
        elif algorithm == 'sgd':
            model = SGDClassifier(random_state=241)
        elif algorithm == 'mlp':
            model = MLPClassifier(random_state=241)
        elif algorithm == 'rf':
            model = RandomForestClassifier(random_state=241)
        elif algorithm == 'gradient_boosting':
            model = GradientBoostingClassifier(random_state=241)

    if task_type == 'regression':
        if algorithm == 'decision_tree':
            model = DecisionTreeRegressor(random_state=241)
        elif algorithm == 'linear_regression':
            model = LinearRegression()
        elif algorithm == 'knn':
            model = KNeighborsRegressor()
        elif algorithm == 'svm':
            model = svm.SVR()  # SVR does not accept a random_state parameter
        elif algorithm == 'mlp':
            model = MLPRegressor(random_state=241)
        elif algorithm == 'rf':
            model = RandomForestRegressor(random_state=241)
        elif algorithm == 'gradient_boosting':
            model = GradientBoostingRegressor(random_state=241)

    scaler_name = 'robust'
    if scaler_name == 'robust':
        scaler = RobustScaler()
    elif scaler_name == 'min_max':
        scaler = MinMaxScaler()
    elif scaler_name == 'max_abs':
        scaler = MaxAbsScaler()
    elif scaler_name == 'standard':
        scaler = StandardScaler()

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('scaler', scaler),
                               ('model', model)])

    pipeline.fit(X_train, y_train)

    # save the trained pipeline to disk
    with open(saved_model_filename, 'wb') as model_file:
        pickle.dump(pipeline, model_file)

    scoring = None
    if task_type == 'regression':
        scoring = 'neg_mean_absolute_error'
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=scoring)
    return score
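A hedged usage sketch for train_model; the CSV path, the column names in the fields mapping and the chosen algorithm are assumptions for illustration, while the argument order and the accepted algorithm names come from the function above.

# hypothetical training call; paths and column names are assumptions
fields = {
    'features': ['age', 'income'],   # assumed feature columns
    'label': 'churn'                 # assumed target column
}

cv_scores = train_model('training_data.csv',
                        fields,
                        'saved_model.pkl',
                        algorithm='rf',
                        task_type='classification')
print('Cross-validation scores:', cv_scores)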
Example #9
import gc
import time

from evaluation import generate_qrel, load_topic_tokens
from indexing import create_index_reader, load_document_stats
from preprocessing import create_preprocessor, split_words

index_filepath = 'spimi.index'
stats_filepath = 'spimi.stats'
topics_filepath = './data/TREC8all/topicsTREC8Adhoc.txt'

preprocessor = create_preprocessor(enable_case_folding=True,
                                   enable_remove_stop_words=True,
                                   enable_stemmer=True,
                                   enable_lemmatizer=False,
                                   min_length=2)

print('Loading topics from', topics_filepath)
topics = load_topic_tokens(topics_filepath, preprocess=preprocessor)
print('Searching', len(topics), 'topics')

print('Loading document stats')
document_stats = load_document_stats(stats_filepath)
print('done')

print('Loading search index')
start = time.time()
number_of_documents, index_reader_generator = create_index_reader(index_filepath)
index_reader = index_reader_generator()
index = list(index_reader)
print('done in', time.time() - start, 'seconds')
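One plausible continuation of the script above, using the generate_qrel import that is not yet called in the snippet. The output path, the run name and the BM25 parameter values are assumptions; the argument order and the k1/b/k3 keys mirror the earlier run_eval example.

# assumed output path, run name and BM25 parameters
output_filepath = 'bm25.qrel'
run_name = 'bm25-baseline'
bm25_params = {'k1': 1.2, 'b': 0.75, 'k3': 8}

generate_qrel(number_of_documents, index, document_stats, topics,
              output_filepath, 'bm25', run_name, bm25_params)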
Example #10
def generate_tokens_for_files_distributed(filepaths,
                                          encoding='latin-1',
                                          use_regex_parser=True,
                                          strip_html_tags=True,
                                          strip_html_entities=True,
                                          strip_square_bracket_tags=True,
                                          preprocess=create_preprocessor()):
    """Generator which provides a list of (doc_id, term) pairs for documents
    contained in the given files
    """

    num_documents_processed = 0
    segment_path = "./segmented_files/"
    partitions = ["aa", "bc", "de", "fh", "ij", "km", "nq", "rs", "tu", "vz"]
    segments = [[] for _ in partitions]

    for filepath in tqdm(filepaths, total=len(filepaths)):
        documents = None

        if use_regex_parser:
            documents = __regex_parse_documents_from_file(filepath)
        else:
            documents = __xml_parse_documents_from_file(filepath)

        for document in documents:
            num_documents_processed += 1
            (doc_id, content) = document
            words = split_words(
                content,
                strip_html_tags=strip_html_tags,
                strip_html_entities=strip_html_entities,
                strip_square_bracket_tags=strip_square_bracket_tags)

            terms = preprocess(words)

            for term in terms:
                # route the term to a partition based on its first character;
                # the last partition catches everything that falls through
                first_char = term[0]
                for index, partition in enumerate(partitions):
                    if index == len(partitions) - 1 or first_char <= partition[1]:
                        segments[index].append(term + " " + doc_id)
                        break

    # record how many documents this worker process handled
    with open(segment_path + "meta_" + str(process_id()), "a") as file:
        file.write("{}\n".format(num_documents_processed))

    for index, segment in enumerate(segments):
        # write the tokenized terms of each partition to its own segment file
        with open(segment_path + partitions[index] + "_" + str(process_id()),
                  "a") as file:
            for t in segment:
                file.write(t + "\n")
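To make the partition routing above easier to follow, here is a small standalone sketch of the same first-character rule; the example terms are made up.

# a term goes into the first partition whose second character is >= the
# term's first character; the last partition catches everything else
partitions = ["aa", "bc", "de", "fh", "ij", "km", "nq", "rs", "tu", "vz"]

def partition_index(term):
    first_char = term[0]
    for index, partition in enumerate(partitions):
        if index == len(partitions) - 1 or first_char <= partition[1]:
            return index

for term in ["apple", "zebra", "house", "moon"]:   # made-up example terms
    print(term, '->', partitions[partition_index(term)])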