Example #1
# Imports are not shown in the original snippet; the paths for the project-local
# modules (textparser, index.hashedindex) are inferred from Example #3, and the
# goose import assumes the python-goose package. Treat them as assumptions.
import os

import numpy as np
from goose import Goose, Configuration
from HTMLParser import HTMLParser

from index.hashedindex import HashedIndex
from textparser import word_tokenize


def generate_feature_matrix(data, stemmer, **prune_params):
    config = Configuration()
    config.enable_image_fetching = False
    config.use_meta_language = False
    goose = Goose(config)

    _parser = HTMLParser()

    sr_index = HashedIndex()

    for url_path, label in data.items():

        if os.path.exists(url_path):
            with open(url_path, 'r') as html_file:
                html_text = html_file.read()

            text = unicode(goose.extract(raw_html=html_text).cleaned_text)
            text = _parser.unescape(text)

            for token in word_tokenize(text, stemmer=stemmer):
                sr_index.add_term_occurrence(token, url_path)

    sr_index.prune(**prune_params)

    X = sr_index.generate_feature_matrix(mode='tfidf')

    # Binary label vector: 1 when the document has a label in `data`, 0 otherwise
    y = np.zeros(len(sr_index.documents()))
    for index, doc in enumerate(sr_index.documents()):
        y[index] = 0 if data[doc] is None else 1

    return X, y
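
The calling convention is not shown in this example. Below is a minimal, hypothetical usage sketch: the labels dictionary, the NullStemmer (borrowed from Example #3), the min_frequency prune keyword, and the scikit-learn classifier are illustrative choices rather than part of the original code.

import textparser  # project-local module from Example #3
from sklearn.linear_model import LogisticRegression

# Hypothetical label source: maps an HTML file on disk to a label, or to None
# for pages that should receive the negative class.
data = {
    '/data/pages/python/relevant_post.html': 'python',
    '/data/pages/python/random_post.html': None,
}

# min_frequency is assumed here to be a valid prune keyword (see Example #3);
# any **prune_params are forwarded verbatim to HashedIndex.prune.
X, y = generate_feature_matrix(data, textparser.NullStemmer(), min_frequency=2)

classifier = LogisticRegression()
classifier.fit(X, y)
print 'Training accuracy: %.3f' % classifier.score(X, y)  # use a held-out split in practice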
Example #2
# Imports are not shown in the original snippet; the goose import assumes the
# python-goose package is in use.
import logging
import os

import numpy as np
from goose import Goose, Configuration


def generate_feature_matrix(wiki, data, n_concepts=10, **word_concept_params):
    """
    Transforms a given data source to a corresponding feature matrix and label
    vector based on the "Bag of Concepts" model which uses Wikipedia as an
    exogenous knowledge source for Word Sense Disambiguation and as additional
    domain knowledge.

    Emits logging output whose verbosity depends on the level currently set
    on the root logger.

    :param wiki: WikiIndex instance backed by a database index
    :param data: data labels loaded using a load_data_source method
    :param n_concepts: maximum number of concepts to keep per page.
    :param word_concept_params: keyword arguments forwarded to wiki.word_concepts.
    :return: NumPy feature matrix and label vector.
    """

    config = Configuration()
    config.enable_image_fetching = False
    config.use_meta_language = False
    goose = Goose(config)

    results = {}
    concepts = set()

    # Iterate through the data and perform training
    for index, (abs_path, label) in enumerate(data.items()):
        if not os.path.exists(abs_path):
            continue

        with open(abs_path, 'r') as fp:
            html_text = fp.read()

        # Recover the page's relative path with a simple heuristic:
        # everything after the 'pages/' directory in the file path.
        cutoff = abs_path.find('pages/')
        rel_path = abs_path[cutoff + 6:]  # 6 == len('pages/')

        logging.info('\n%d: http://%s', index, rel_path[:-3])
        article = goose.extract(raw_html=html_text)

        if len(article.cleaned_text) > 500:
            logging.info('%s (%s)', article.title, label)

            search_results, terms, query_vector = wiki.word_concepts(
                article.cleaned_text, article.title, **word_concept_params)

            if search_results:
                # Keep the top n_concepts results, dropping any with a weight of 0
                results[abs_path] = [(sr.page_id, sr.weight)
                                     for sr in search_results[:n_concepts]
                                     if sr.weight > 0]

                for search_result in search_results[:n_concepts]:
                    concepts.add(search_result.page_id)

                logging.info(search_results[:n_concepts])
            else:
                logging.warning('No word concepts returned')
        else:
            logging.info('Document is of insufficient length')

    # Feature matrix dimensions: one row per document, one column per concept
    shape = (len(results), len(concepts))

    # Map each Wikipedia page_id to a column index in the feature matrix
    concepts_index = {page_id: j for j, page_id in enumerate(concepts)}

    feature_matrix = np.zeros(shape=shape)
    label_vector = np.zeros(len(results))

    for i, (abs_path, page_list) in enumerate(results.iteritems()):
        label_vector[i] = 1 if data[abs_path] is not None else 0

        for page_id, weight in page_list:
            j = concepts_index[page_id]
            feature_matrix[i, j] = weight

    return feature_matrix, label_vector
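
The closing loop is what turns the intermediate results dictionary into the dense document-by-concept matrix. Below is a small, self-contained illustration of just that indexing step, using made-up page IDs and weights in place of a real WikiIndex.

import numpy as np

# Toy stand-in for `results`: document path -> [(wikipedia_page_id, weight), ...]
results = {
    '/data/pages/python/a.html': [(101, 0.9), (205, 0.4)],
    '/data/pages/python/b.html': [(205, 0.7)],
}
concepts = {101, 205}

# One column per distinct concept, one row per document
concepts_index = {page_id: j for j, page_id in enumerate(concepts)}
feature_matrix = np.zeros((len(results), len(concepts)))

for i, (path, page_list) in enumerate(results.items()):
    for page_id, weight in page_list:
        feature_matrix[i, concepts_index[page_id]] = weight

# Each cell now holds the weight word_concepts assigned to that
# (document, concept) pair, and stays 0 when the concept was not
# returned for the page.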
Example #3
# NOTE: the goose import path is an assumption (python-goose); textparser is the
# project-local tokenizer module used further down.
from HTMLParser import HTMLParser

from goose import Goose, Configuration

import textparser
from datasource import load_data_source
from index.hashedindex import HashedIndex, load_meta
from utils import search_files

if __name__ == '__main__':

    import time
    t0 = time.time()

    _parser = HTMLParser()

    _config = Configuration()
    _config.enable_image_fetching = False
    _config.use_meta_language = False

    _goose = Goose(_config)

    # The Lancaster stemmer is extremely slow, so fall back to a NullStemmer
    _stemmer = textparser.NullStemmer()

    data_path = '/home/michaela/Development/Reddit-Testing-Data'

    # Set the parameters to the program over here
    force_reindex = False
    parameters = {
        'samples': 800,
        'subreddit': 'python',
        'min_frequency': 2,
        'stemmer': str(_stemmer),
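
The listing breaks off inside the parameters dictionary, so the rest of the script is not shown. Since it imports load_meta and defines a force_reindex flag, a cache-or-rebuild step presumably follows; the sketch below, which continues the names defined above, shows one plausible shape for it. The cache path, the pickle format, the load_data_source call signature, and the use of generate_feature_matrix from Example #1 are all assumptions, not the original logic.

import os
import pickle

# Hypothetical cache-or-rebuild step gated by force_reindex. The cache file
# name and pickle format are invented for illustration, and the
# load_data_source signature is assumed.
cache_path = os.path.join(data_path, 'feature_matrix.pkl')

if force_reindex or not os.path.exists(cache_path):
    data = load_data_source(data_path)  # assumed to return {file_path: label_or_None}
    X, y = generate_feature_matrix(data, _stemmer,  # Example #1's function
                                   min_frequency=parameters['min_frequency'])
    with open(cache_path, 'wb') as fp:
        pickle.dump({'parameters': parameters, 'X': X, 'y': y}, fp)
else:
    with open(cache_path, 'rb') as fp:
        cached = pickle.load(fp)
    X, y = cached['X'], cached['y']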