def getWords(self, text):
    """Lemmatize *text* (French) and return its words as dicts.

    Each dict carries the lemma ('word'), its term frequency ('tf'),
    raw occurrence count ('count') and part of speech ('pos').
    Returns an empty list when lemmatization yields no usable text.
    """
    lemmas = LemmatizeText(self.ct.removePunctuation(text), "FR")
    lemmas.createLemmaText()
    # Nothing survived cleaning -> no words to report.
    if not lemmas.cleanText or lemmas.cleanText == " ":
        return []
    lemmas.createLemmas()
    return [
        {'word': w.word, 'tf': w.tf, 'count': w.count, 'pos': w.wtype}
        for w in lemmas.wordList
    ]
def analysis_dashboard_page2(): keywords = request.form['keyword'] date = request.form['date'] checked_genders = request.form.getlist('gender') checked_ages = request.form.getlist('age') print date, keywords, checked_genders, checked_ages lem = LemmatizeText(keywords) lem.createLemmaText() lem.createLemmas() wordList = [] for word in lem.wordList: """ If you want to use a regex, This example will construct a regex that contains the lemma similar in SQL to -> where word like '%f**k%' """ #regex = re.compile(word.word, re.IGNORECASE) #wordList.append(regex) """ this one will find only the tweets with the matching word """ wordList.append(word.word) global query query = {} global query_pretty query_pretty = "" if wordList: query_pretty += "Keyword filter: " + ' '.join(wordList) + "<br/>" query["words.word"] = {"$in": wordList} if date: query_pretty += "Date filter: " + date + "<br/>" start, end = date.split(" ") query["date"] = {"$gt": start, "$lte": end} if checked_ages and 0 < len(checked_ages) < 6: query_pretty += "Age filter: " + ' '.join(checked_ages) + "<br/>" query["age"] = {"$in": checked_ages} if checked_genders and len(checked_genders) == 1: query_pretty += "Gender filter: " + ' '.join(checked_genders) + "<br/>" query["gender"] = checked_genders[0] if query: vocab = VocabularyIndex(dbname) vocab.createIndex(query) tweetCount = getTweetCount() return render_template('analysis.html', tweetCount=tweetCount, dates=date, keywords=' '.join(wordList))
# NOTE(review): this `return lemmas` is the tail of a function whose
# definition starts before this chunk -- left untouched.
return lemmas


# Script section: lemmatize titles/abstracts of the RNTI article export.
# French texts go through an external POS tagger (extractPOS on a temp
# file); English texts go through LemmatizeText directly.
header, corpus = readCSV('RNTI_articles_export_fixed1347_ids.txt')
print header
idx = 0
for line in corpus:
    # language title -- line[9] is the title language, line[3] the title,
    # line[8] the article id (assumption from the code; verify against the CSV header)
    if line[9] == 'fr':
        filename = 'texts/' + str(line[8]) + 'title'
        writeFile(filename, line[3])
        pos_title = extractPOS(filename)
        lemma_title = splitPos(pos_title)
    elif line[9] == 'en':
        lt = LemmatizeText(line[3])
        lt.createLemmaText()
        lemma_title = lt.cleanText
    # language abstract -- line[10] is the abstract language, line[4] the abstract
    if line[10] == 'fr':
        filename = 'texts/' + str(line[8]) + 'abstract'
        writeFile(filename, line[4])
        pos_abstract = extractPOS(filename)
        lemma_abstract = splitPos(pos_abstract)
    elif line[10] == 'en':
        lt = LemmatizeText(line[4])
        lt.createLemmaText()
        lemma_abstract = lt.cleanText
    # Only rows whose title AND abstract share a language get column 12 filled.
    # NOTE(review): lemma_title/lemma_abstract leak across iterations when a
    # language field is neither 'fr' nor 'en' (stale value from previous row).
    if line[9] == 'fr' and line[10] == 'fr':
        line[12] = lemma_title + ' ' + lemma_abstract
    if line[9] == 'en' and line[10] == 'en':
        # NOTE(review): chunk is truncated here -- the body of this branch
        # is not visible in this view.
def process_element(elem): document = dict() if len(elem) == 9: try: # construct the document # rawText = elem[4].decode('latin-1').encode('utf-8')#.encode('latin-1').encode('string_escape').replace('\r', '').replace('\n', '') document['rawText'] = elem[4].encode('latin-1')#.encode('string_escape').replace('\r', '').replace('\n', '') document['series'] = elem[0] document['booktitle'] = elem[1] document['year'] = elem[2] document['title'] = elem[3].encode('latin-1') #authors authors = elem[5].split(',') #document['authors'] = [ {'name': author.strip(' ').decode('latin-1').encode('utf-8'), 'position': authors.index(author)} for author in authors] document['authors'] = [ {'name': author.strip(' ').encode('latin-1'), 'position': authors.index(author)} for author in authors] document['pdf1page'] = elem[6] document['pdfarticle'] = elem[7] document['_id'] = elem[8] try: lang = detect(elem[4].decode('latin-1')).upper() except Exception as e1: try: lang = detect(elem[3].decode('latin-1')).upper() print e1, 'aici try 2' except Exception as e2: lang = 'FR' print e2, 'aici try 3' document['language'] = lang if len(elem[4])>0: try: cleanText = ct.cleanTextSimple(elem[4].encode('latin-1'), lang) # if clean text exists # print cleanText if len(ct.removePunctuation(cleanText)) > 0: # extract lemmas and part of speech lemmas = LemmatizeText(rawText=ct.removePunctuation(cleanText), language=lang) lemmas.createLemmaText() lemmaText = lemmas.cleanText if lemmaText and lemmaText != " ": lemmas.createLemmas() words = [] for w in lemmas.wordList: word = dict() word['word'] = w.word word['tf'] = w.tf word['count'] = w.count word['pos'] = w.wtype words.append(word) document['cleanText'] = cleanText#.encode('latin-1').encode('string_escape').replace('\r', '').replace('\n', '') document['lemmaText'] = lemmaText document['words'] = words except Exception as e: print e, 'sunt in lemmaText' except Exception as e: print e, 'aici try 1', elem else: print 'aici in else', elem return document
def processElement_serial(elem, language, mode=0): document = dict() # get language if len(elem) >= 8: lang = elem[7] else: lang = language # get clean text try: cleanText, hashtags, attags = ct.cleanText(elem[1], lang) # if clean text exists if len(ct.removePunctuation(cleanText)) > 0: # extract lemmas and part of speech lemmas = LemmatizeText(rawText=ct.removePunctuation(cleanText), language=lang, mode=mode) lemmas.createLemmaText() lemmaText = lemmas.cleanText if lemmaText and lemmaText != " ": lemmas.createLemmas() words = [] for w in lemmas.wordList: word = dict() word['word'] = w.word word['tf'] = w.tf word['count'] = w.count word['pos'] = w.wtype words.append(word) # named entities: ner = NamedEntitiesRegonizer(text=cleanText, language=lang) ner.createNamedEntities() if ner.ner: document['namedEntities'] = ner.ner # construct the document document['_id'] = elem[0] document['rawText'] = elem[1].encode('utf8').encode( 'string_escape').replace('\r', '').replace('\n', '') document['cleanText'] = cleanText.encode('utf8').encode( 'string_escape').replace('\r', '').replace('\n', '') document['lemmaText'] = lemmaText document['date'] = elem[2] document['author'] = elem[3] document['words'] = words # geo location [x, y] document['geoLocation'] = elem[4].split(' ') # author age # this are the change required for the moment when we will keep age as a number # age = elem[5].split('-') # document['age'] = int(age[1]) - int(age[0]) document['age'] = elem[5] # this are the changes required for the moment when we will keep gender as a number # author gender - 1 male, 2 female, 0 unknown # document['gender'] = gender.get(elem[6], 0) document['gender'] = elem[6] if attags: document['attags'] = attags if hashtags: document['hashtags'] = hashtags except Exception as e: print e return document