Ejemplos de LemmatizeText.LemmatizeText en Python, ejemplos de nlplib.lemmatize_text.LemmatizeText.LemmatizeText en Python

Ejemplo n.º 1

0

Mostrar archivo

 def getWords(self, text):
     lemmas = LemmatizeText(self.ct.removePunctuation(text), "FR")
     lemmas.createLemmaText()
     lemmaText = lemmas.cleanText
     words = []
     if lemmaText and lemmaText != " ":
         lemmas.createLemmas()
         for w in lemmas.wordList:
             word = {}
             word['word'] = w.word
             word['tf'] = w.tf
             word['count'] = w.count
             word['pos'] = w.wtype
             words.append(word)
     return words

Ejemplo n.º 2

0

Mostrar archivo

Archivo: cats.py Proyecto: cipriantruica/CATS

def analysis_dashboard_page2():
    keywords = request.form['keyword']
    date = request.form['date']
    checked_genders = request.form.getlist('gender')
    checked_ages = request.form.getlist('age')
    print date, keywords, checked_genders, checked_ages
    lem = LemmatizeText(keywords)
    lem.createLemmaText()
    lem.createLemmas()
    wordList = []
    for word in lem.wordList:
        """
            If you want to use a regex,
            This example will construct a regex that contains the lemma
            similar in SQL to -> where word like '%f**k%'
        """
        #regex = re.compile(word.word, re.IGNORECASE)
        #wordList.append(regex)
        """
            this one will find only the tweets with the matching word
        """
        wordList.append(word.word)
    global query
    query = {}
    global query_pretty
    query_pretty = ""
    if wordList:
        query_pretty += "Keyword filter: " + ' '.join(wordList) + "<br/>"
        query["words.word"] = {"$in": wordList}
    if date:
        query_pretty += "Date filter: " + date + "<br/>"
        start, end = date.split(" ")
        query["date"] = {"$gt": start, "$lte": end}
    if checked_ages and 0 < len(checked_ages) < 6:
        query_pretty += "Age filter: " + ' '.join(checked_ages) + "<br/>"
        query["age"] = {"$in": checked_ages}
    if checked_genders and len(checked_genders) == 1:
        query_pretty += "Gender filter: " + ' '.join(checked_genders) + "<br/>"
        query["gender"] = checked_genders[0]
    if query:
        vocab = VocabularyIndex(dbname)
        vocab.createIndex(query)
    tweetCount = getTweetCount()
    return render_template('analysis.html',
                           tweetCount=tweetCount,
                           dates=date,
                           keywords=' '.join(wordList))

Ejemplo n.º 3

0

Mostrar archivo

Archivo: lemmas.py Proyecto: AdrienGuille/EGC-Cup-2016

    return lemmas


header, corpus = readCSV('RNTI_articles_export_fixed1347_ids.txt')

print header
idx = 0
for line in corpus:
    # language title
    if line[9] == 'fr':
        filename = 'texts/' + str(line[8]) + 'title'
        writeFile(filename, line[3])
        pos_title = extractPOS(filename)
        lemma_title = splitPos(pos_title)
    elif line[9] == 'en':
        lt = LemmatizeText(line[3])
        lt.createLemmaText()
        lemma_title = lt.cleanText
    # language abstract
    if line[10] == 'fr':
        filename = 'texts/' + str(line[8]) + 'abstract'
        writeFile(filename, line[4])
        pos_abstract = extractPOS(filename)
        lemma_abstract = splitPos(pos_abstract)
    elif line[10] == 'en':
        lt = LemmatizeText(line[4])
        lt.createLemmaText()
        lemma_abstract = lt.cleanText
    if line[9] == 'fr' and line[10] == 'fr':
        line[12] = lemma_title + ' ' + lemma_abstract
    if line[9] == 'en' and line[10] == 'en':

Ejemplo n.º 4

0

Mostrar archivo

def process_element(elem):
    document = dict()
    if len(elem) == 9:
        try:
            # construct the document
            # rawText = elem[4].decode('latin-1').encode('utf-8')#.encode('latin-1').encode('string_escape').replace('\r', '').replace('\n', '')
            document['rawText'] = elem[4].encode('latin-1')#.encode('string_escape').replace('\r', '').replace('\n', '')
            document['series'] = elem[0]
            document['booktitle'] = elem[1]
            document['year'] = elem[2]
            document['title'] = elem[3].encode('latin-1')
            #authors
            authors = elem[5].split(',')
            #document['authors'] = [ {'name': author.strip(' ').decode('latin-1').encode('utf-8'), 'position': authors.index(author)} for author in authors]
            document['authors'] = [ {'name': author.strip(' ').encode('latin-1'), 'position': authors.index(author)} for author in authors]
            document['pdf1page'] = elem[6]
            document['pdfarticle'] = elem[7]
            document['_id'] = elem[8]

            try:
                lang = detect(elem[4].decode('latin-1')).upper()
            except Exception as e1:
                try:
                    lang = detect(elem[3].decode('latin-1')).upper()
                    print e1, 'aici try 2'
                except Exception as e2:
                    lang = 'FR'
                    print e2, 'aici try 3'
            document['language'] = lang

            
            if len(elem[4])>0:
                try:
                    cleanText = ct.cleanTextSimple(elem[4].encode('latin-1'), lang)
                    # if clean text exists
                    # print cleanText
                    if len(ct.removePunctuation(cleanText)) > 0:
                        # extract lemmas and part of speech
                        lemmas = LemmatizeText(rawText=ct.removePunctuation(cleanText), language=lang)
                        lemmas.createLemmaText()
                        lemmaText = lemmas.cleanText
                        if lemmaText and lemmaText != " ":
                            lemmas.createLemmas()
                            words = []
                            for w in lemmas.wordList:
                                word = dict()
                                word['word'] = w.word
                                word['tf'] = w.tf
                                word['count'] = w.count
                                word['pos'] = w.wtype
                                words.append(word)

                            document['cleanText'] = cleanText#.encode('latin-1').encode('string_escape').replace('\r', '').replace('\n', '')
                            document['lemmaText'] = lemmaText
                            document['words'] = words
                except Exception as e:
                    print e, 'sunt in lemmaText'
        except Exception as e:
            print e, 'aici try 1', elem
    else:
        print 'aici in else', elem
    return document

Ejemplo n.º 5

0

Mostrar archivo

Archivo: ddl_mongo_new.py Proyecto: cipriantruica/CATS

def processElement_serial(elem, language, mode=0):
    document = dict()
    # get language
    if len(elem) >= 8:
        lang = elem[7]
    else:
        lang = language
    # get clean text
    try:
        cleanText, hashtags, attags = ct.cleanText(elem[1], lang)
        # if clean text exists
        if len(ct.removePunctuation(cleanText)) > 0:
            # extract lemmas and part of speech
            lemmas = LemmatizeText(rawText=ct.removePunctuation(cleanText),
                                   language=lang,
                                   mode=mode)
            lemmas.createLemmaText()
            lemmaText = lemmas.cleanText
            if lemmaText and lemmaText != " ":
                lemmas.createLemmas()
                words = []
                for w in lemmas.wordList:
                    word = dict()
                    word['word'] = w.word
                    word['tf'] = w.tf
                    word['count'] = w.count
                    word['pos'] = w.wtype
                    words.append(word)

                # named entities:
                ner = NamedEntitiesRegonizer(text=cleanText, language=lang)
                ner.createNamedEntities()
                if ner.ner:
                    document['namedEntities'] = ner.ner

                # construct the document
                document['_id'] = elem[0]
                document['rawText'] = elem[1].encode('utf8').encode(
                    'string_escape').replace('\r', '').replace('\n', '')
                document['cleanText'] = cleanText.encode('utf8').encode(
                    'string_escape').replace('\r', '').replace('\n', '')
                document['lemmaText'] = lemmaText
                document['date'] = elem[2]
                document['author'] = elem[3]
                document['words'] = words
                # geo location [x, y]
                document['geoLocation'] = elem[4].split(' ')
                # author age
                # this are the change required for the moment when we will keep age as a number
                # age = elem[5].split('-')
                # document['age'] = int(age[1]) - int(age[0])
                document['age'] = elem[5]

                # this are the changes required for the moment when we will keep gender as a number
                # author gender - 1 male, 2 female, 0 unknown
                # document['gender'] = gender.get(elem[6], 0)
                document['gender'] = elem[6]

                if attags:
                    document['attags'] = attags
                if hashtags:
                    document['hashtags'] = hashtags
    except Exception as e:
        print e
    return document