Example #1
import logging

from gensim.corpora import Dictionary
from gensim.utils import lemmatize, revdict  # note: lemmatize() needs the pattern package and was removed in gensim 4.0

# clean_text and convert_compound are project-local helpers (not shown in these examples)


def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """Lemmatize, filter, and convert raw documents into a gensim Dictionary with an attached bag-of-words corpus.

    :rtype: gensim.corpora.dictionary.Dictionary
    :param corpora: dict mapping an original document id to its raw text
    :param stopwords: collection of stop words to drop from every document
    :param allowed_pos: compiled regular expression of POS tags to keep, passed to lemmatize()
    :param max_doc: maximum number of documents to process
    :param no_above: drop tokens that appear in more than this fraction of documents
    :param no_below: drop tokens that appear in fewer than this many documents
    :param keep_n: after filtering, keep only the keep_n most frequent tokens (None keeps all)
    :return: Dictionary whose corpus, corpus_id2orig_id, and id2token attributes hold the processed corpus
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # drop stop words, tokens shorter than 2 or longer than 15 characters, and tokens that are not all lowercase
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
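A minimal usage sketch of preprocess_corpora() follows; the two toy documents, the stop word list, and the POS regex are illustrative assumptions, and the project-local helpers clean_text() and convert_compound() must be importable for the call to run:

import re

corpora = {'doc1': 'Cats are chasing mice in the garden.',
           'doc2': 'The garden was full of cats yesterday.'}
stopwords = set(['the', 'in', 'of', 'are', 'was'])

# keep only nouns and verbs; lemmatize() tags each token as word/POS
dictionary = preprocess_corpora(corpora, stopwords, allowed_pos=re.compile('(NN|VB)'))

print dictionary.corpus              # bag-of-words vectors, one per document
print dictionary.corpus_id2orig_id   # maps corpus positions back to 'doc1', 'doc2'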
Example #2
    count = 0
    doc_num = len(wikis)
    new_wikis = []
    keywords = []
    for keyword, wiki in wikis.items():
        count += 1

        print '\r', count, '/', doc_num,
        text = wiki['text']
        cleaned = clean_text(text)  # delete irrelevant characters

        wiki = []
        tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
        for token in tokens:
            word, pos = token.split('/')
            wiki.append(word)

        # convert compound word into one token
        wiki = convert_compound(wiki)

        # filter stop words, long words, and non-english words
        wiki = [w for w in wiki if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]  # FIXME: this allows non-english characters to be stored

        new_wikis.append(wiki)
        keywords.append(keyword)

    print '\n'

    logging.info('Saving wiki corpus...')
    enpickle(new_wikis, 'data/processed/wikis.pkl')
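enpickle() is another project-local helper that is not defined in these snippets; judging by the call sites, it is presumably a thin wrapper around pickle.dump. A hypothetical reconstruction under that assumption:

import pickle

def enpickle(obj, path):
    # hypothetical: serialize obj to path using pickle
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)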
Example #3
        print '\r', count, '/', doc_num,
        text = document['text'] + (' ' + index) * title_weight  # append the title (index) title_weight times to up-weight title terms
        from_name = document['from']
        date = document['date']

        cleaned = clean_text(text)  # delete irrelevant characters

        document = []
        tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
        for token in tokens:
            word, pos = token.split('/')
            document.append(word)

        # convert compound word into one token
        document = convert_compound(document)

        # filter stop words, long words, and non-english words
        document = [w for w in document if w not in stop_words and 2 <= len(w) <= 15 and w.islower()]

        new_documents.append(document)
        titles.append(index)
        froms.append(from_name)
        dates.append(date)

    print '\n'
    logging.info('create dictionary and corpus...')
    dictionary = corpora.Dictionary(new_documents)
    dictionary.docid2title = titles
    dictionary.docid2from = froms
    dictionary.docid2date = dates
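clean_text(), called at the top of every example, is likewise project-local and not shown. A minimal sketch of what such a cleaner might look like, assuming it only strips markup, drops non-alphabetic characters, and collapses whitespace (the real implementation may differ):

import re

def clean_text(text):
    # hypothetical: remove HTML-like tags, keep letters and basic punctuation,
    # then collapse runs of whitespace
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r"[^A-Za-z.,;:'\- ]+", ' ', text)
    return re.sub(r'\s+', ' ', text).strip()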
Example #4
    for keyword, wiki in wikis.items():
        count += 1

        print '\r', count, '/', doc_num,
        text = wiki['text']
        cleaned = clean_text(text)  # delete irrelevant characters

        wiki = []
        tokens = lemmatize(content=cleaned,
                           allowed_tags=allowed_pos)  # lemmatize
        for token in tokens:
            word, pos = token.split('/')
            wiki.append(word)

        # convert compound word into one token
        wiki = convert_compound(wiki)

        # filter stop words, long words, and non-english words
        wiki = [
            w for w in wiki
            if w not in stop_words and 2 <= len(w) <= 15 and w.islower()
        ]  # FIXME: it allows non-english characters to be stored

        new_wikis.append(wiki)
        keywords.append(keyword)

    print '\n'
    enpickle(new_wikis, 'data/txt/processed_wiki.pkl')

    logging.info('create dictionary and corpus...')
    dictionary = corpora.Dictionary(new_wikis)
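convert_compound(), used by all four examples, is also project-local. Its comment says it merges a compound word into a single token; a hypothetical sketch of that idea, assuming a fixed set of known bigrams joined with an underscore:

def convert_compound(tokens):
    # hypothetical: merge adjacent tokens that form a known compound into one
    # underscore-joined token; the COMPOUNDS set is illustrative, not from the source
    COMPOUNDS = set([('machine', 'learning'), ('new', 'york')])
    merged = []
    i = 0
    while i < len(tokens):
        if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) in COMPOUNDS:
            merged.append(tokens[i] + '_' + tokens[i + 1])
            i += 2
        else:
            merged.append(tokens[i])
            i += 1
    return merged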