Python Configuration.use_meta_languageの例

プログラミング言語: Python

名前空間/パッケージ名: goose

クラス/型: Configuration

メソッド/関数: use_meta_language

hotexamples.comのコード掲載数: 4

Python Configuration.use_meta_language - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのgoose.Configuration.use_meta_languageの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

Configuration(6)

enable_image_fetching(6)

use_meta_language(3)

コード例 #1

ファイルを表示

ファイル: bow.py プロジェクト: MichaelAquilina/Reddit-Recommender-Bot

def generate_feature_matrix(data, stemmer, **prune_params):
    """
    Build a TF-IDF feature matrix and a label vector from saved HTML pages.

    Every path in ``data`` that exists on disk is read, run through Goose to
    obtain its cleaned text, HTML-unescaped, tokenized and added to a
    HashedIndex keyed by that path. Paths that do not exist are skipped.

    :param data: mapping of HTML file path to label. A label of None is
        encoded as 0 in the label vector, any other label as 1.
    :param stemmer: stemmer forwarded to word_tokenize.
    :param prune_params: keyword arguments forwarded to the index's prune().
    :return: tuple (X, y) where X is what the index's
        generate_feature_matrix(mode='tfidf') returns and y is a numpy
        vector with one entry per document in the index.
    """
    # Goose is configured with image fetching and meta language switched off
    extractor_config = Configuration()
    extractor_config.enable_image_fetching = False
    extractor_config.use_meta_language = False
    extractor = Goose(extractor_config)

    entity_decoder = HTMLParser()

    term_index = HashedIndex()

    for page_path, _label in data.items():
        # Guard clause: nothing to index when the page is not on disk
        if not os.path.exists(page_path):
            continue

        with open(page_path, 'r') as page_file:
            raw_html = page_file.read()

        article = extractor.extract(raw_html=raw_html)
        page_text = entity_decoder.unescape(unicode(article.cleaned_text))

        for term in word_tokenize(page_text, stemmer=stemmer):
            term_index.add_term_occurrence(term, page_path)

    term_index.prune(**prune_params)

    X = term_index.generate_feature_matrix(mode='tfidf')

    # The vector starts as all zeros, so only non-None labels need writing
    y = np.zeros(len(term_index.documents()))
    for row, document in enumerate(term_index.documents()):
        if data[document] is not None:
            y[row] = 1

    return X, y

コード例 #2

ファイルを表示

ファイル: boc.py プロジェクト: MichaelAquilina/Reddit-Recommender-Bot

def generate_feature_matrix(wiki, data, n_concepts=10, **word_concept_params):
    """
    Transforms a given data source to a corresponding feature matrix and label
    vector based on the "Bag of Concepts" model which uses Wikipedia as an
    exogenous knowledge source for Word Sense Disambiguation and as additional
    domain knowledge.

    Paths in ``data`` that do not exist on disk are skipped, as are pages
    whose cleaned text is 500 characters or shorter and pages for which no
    word concepts are returned. Each remaining page becomes one row of the
    feature matrix; each distinct page id among the top ``n_concepts``
    search results becomes one column.

    Contains logging code which is displayed depending on the currently set
    logging level of the root logger.
    :param wiki: WikiIndex instance to some database index
    :param data: data labels loaded using a load_data_source method; a label
        of None is encoded as 0 in the label vector, anything else as 1.
    :param n_concepts: number of concepts to use per page.
    :param word_concept_params: word concept parameters to use for generation of concepts.
    :return: Numpy Feature Matrix and Label Vector.
    """

    config = Configuration()
    config.enable_image_fetching = False
    config.use_meta_language = False
    goose = Goose(config)

    results = {}
    concepts = set()

    # Iterate through the data and perform training
    for index, (abs_path, label) in enumerate(data.items()):
        if not os.path.exists(abs_path):
            continue

        with open(abs_path, 'r') as fp:
            html_text = fp.read()

        # Determine relative path using a simple heuristic.
        # NOTE(review): when 'pages/' is absent find() returns -1, so the
        # slice starts at offset 5; this only affects the log line below.
        cutoff = abs_path.find('pages/')
        rel_path = abs_path[cutoff + 6:]

        # Lazy logging arguments: the message is only formatted if emitted
        logging.info('\n%d: http://%s', index, rel_path[:-3])
        article = goose.extract(raw_html=html_text)

        if len(article.cleaned_text) <= 500:
            logging.info('Document is of insufficient length')
            continue

        logging.info('%s (%s)', article.title, label)

        search_results, _terms, _query_vector = wiki.word_concepts(
            article.cleaned_text, article.title, **word_concept_params)

        if not search_results:
            logging.warning('No word concepts returned')
            continue

        top_results = search_results[:n_concepts]

        # Keep (page_id, weight) pairs, dropping concepts with a weight of 0.
        # A real list (not a lazy filter object) works on Python 2 and 3.
        results[abs_path] = [(sr.page_id, sr.weight)
                             for sr in top_results
                             if sr.weight > 0]

        # Every top result contributes a column, including zero-weight ones
        concepts.update(sr.page_id for sr in top_results)

        logging.info(top_results)

    # Map each concept page id to its column in the feature matrix
    concepts_index = {page_id: column
                      for column, page_id in enumerate(concepts)}

    feature_matrix = np.zeros(shape=(len(results), len(concepts)))
    label_vector = np.zeros(len(results))

    # items() instead of the Python-2-only iteritems()
    for i, (abs_path, page_list) in enumerate(results.items()):
        label_vector[i] = 1 if data[abs_path] is not None else 0

        for page_id, weight in page_list:
            feature_matrix[i, concepts_index[page_id]] = weight

    return feature_matrix, label_vector

コード例 #3

ファイルを表示

ファイル: main.py プロジェクト: MichaelAquilina/Reddit-Recommender-Bot

from HTMLParser import HTMLParser

from datasource import load_data_source
from index.hashedindex import HashedIndex, load_meta
from utils import search_files

if __name__ == '__main__':

    import time
    t0 = time.time()

    _parser = HTMLParser()

    _config = Configuration()
    _config.enable_image_fetching = False
    _config.use_meta_language = False

    _goose = Goose(_config)

    # Lancaster Stemmer is very very slow
    _stemmer = textparser.NullStemmer()

    data_path = '/home/michaela/Development/Reddit-Testing-Data'

    # Set the parameters to the program over here
    force_reindex = False
    parameters = {
        'samples': 800,
        'subreddit': 'python',
        'min_frequency': 2,
        'stemmer': str(_stemmer),

コード例 #4

ファイルを表示

ファイル: boc.py プロジェクト: MichaelAquilina/Reddit-Recommender-Bot

def generate_feature_matrix(wiki, data, n_concepts=10, **word_concept_params):
    """
    Transforms a given data source to a corresponding feature matrix and label
    vector based on the "Bag of Concepts" model which uses Wikipedia as an
    exogenous knowledge source for Word Sense Disambiguation and as additional
    domain knowledge.

    Paths in ``data`` that do not exist on disk are skipped, as are pages
    whose cleaned text is 500 characters or shorter and pages for which no
    word concepts are returned. Each remaining page becomes one row of the
    feature matrix; each distinct page id among the top ``n_concepts``
    search results becomes one column.

    Contains logging code which is displayed depending on the currently set
    logging level of the root logger.
    :param wiki: WikiIndex instance to some database index
    :param data: data labels loaded using a load_data_source method; a label
        of None is encoded as 0 in the label vector, anything else as 1.
    :param n_concepts: number of concepts to use per page.
    :param word_concept_params: word concept parameters to use for generation of concepts.
    :return: Numpy Feature Matrix and Label Vector.
    """

    config = Configuration()
    config.enable_image_fetching = False
    config.use_meta_language = False
    goose = Goose(config)

    results = {}
    concepts = set()

    # Iterate through the data and perform training
    for index, (abs_path, label) in enumerate(data.items()):
        if not os.path.exists(abs_path):
            continue

        with open(abs_path, 'r') as fp:
            html_text = fp.read()

        # Determine relative path using a simple heuristic.
        # NOTE(review): when 'pages/' is absent find() returns -1, so the
        # slice starts at offset 5; this only affects the log line below.
        cutoff = abs_path.find('pages/')
        rel_path = abs_path[cutoff + 6:]

        # Lazy logging arguments: the message is only formatted if emitted
        logging.info('\n%d: http://%s', index, rel_path[:-3])
        article = goose.extract(raw_html=html_text)

        if len(article.cleaned_text) <= 500:
            logging.info('Document is of insufficient length')
            continue

        logging.info('%s (%s)', article.title, label)

        search_results, _terms, _query_vector = wiki.word_concepts(
            article.cleaned_text, article.title, **word_concept_params)

        if not search_results:
            logging.warning('No word concepts returned')
            continue

        top_results = search_results[:n_concepts]

        # Keep (page_id, weight) pairs, dropping concepts with a weight of 0.
        # A real list (not a lazy filter object) works on Python 2 and 3.
        results[abs_path] = [(sr.page_id, sr.weight)
                             for sr in top_results
                             if sr.weight > 0]

        # Every top result contributes a column, including zero-weight ones
        concepts.update(sr.page_id for sr in top_results)

        logging.info(top_results)

    # Map each concept page id to its column in the feature matrix
    concepts_index = {page_id: column
                      for column, page_id in enumerate(concepts)}

    feature_matrix = np.zeros(shape=(len(results), len(concepts)))
    label_vector = np.zeros(len(results))

    # items() instead of the Python-2-only iteritems()
    for i, (abs_path, page_list) in enumerate(results.items()):
        label_vector[i] = 1 if data[abs_path] is not None else 0

        for page_id, weight in page_list:
            feature_matrix[i, concepts_index[page_id]] = weight

    return feature_matrix, label_vector