def detect_lang(text, lang_code_hint=None):
    """
    Try to detect the language of a given text.

    :param unicode text: Body of text of which we want to detect the language
    :param string lang_code_hint: Expected language code. May bias detection
        towards the expected language
    :rtype: tuple
    :returns: detected language name in English, language code, reliability flag,
        list of possible languages - lists details for other possible matches of
        the form [('ENGLISH', 'en', text percentage, normalized score), ...]
    """
    try:
        detected_lang, detected_lang_code, is_reliable, textBytesFound, details = cld.detect(
            text,
            hintLanguageCode=lang_code_hint,
            pickSummaryLanguage=False,
            isPlainText=False,
            removeWeakMatches=False
        )
    except UnicodeEncodeError:
        detected_lang, detected_lang_code, is_reliable, textBytesFound, details = cld.detect(
            text.encode("utf-8"),
            hintLanguageCode=lang_code_hint,
            pickSummaryLanguage=False,
            isPlainText=False,
            removeWeakMatches=False,
        )
    except cld.error as e:
        raise Exception(str(e))

    possible_matches = []
    for d in details:
        possible_matches.append((d[0].title(), d[1], d[2]))

    return (detected_lang.title(), detected_lang_code, is_reliable, possible_matches)
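# Hedged usage sketch (added for illustration, not from the original source).
# It assumes `cld` is the chromium-compact-language-detector binding used
# throughout these examples and that detect_lang() above is in scope.
def _demo_detect_lang():
    # Hypothetical sample input; the hint biases detection towards French.
    name, code, reliable, matches = detect_lang(u"Bonjour tout le monde", lang_code_hint='fr')
    # Expected shape: name like 'French', code like 'fr', reliable is a bool,
    # matches like [('French', 'fr', 100, <normalized score>), ...]
    return name, code, reliable, matches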
def process(self, file_data):
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        file_mime_type = m.id_buffer(file_data['contents'])
    metadata = {}
    if file_mime_type == 'text/plain':
        text = file_data['contents']
    elif file_mime_type == 'text/html':
        text = parse_html(file_data['contents'], True, ['script', 'style'])
    elif file_mime_type == 'application/pdf':
        text, metadata = extract_pdf(file_data['contents'])
    else:
        # If we can't detect the mimetype we add a flag that can be read by
        # the frontend to provide more information on why the document
        # wasn't processed.
        # XXX: We're returning an empty text because if we don't the
        # pipeline will run indefinitely. The right approach is to make
        # pypelinin understand a specific exception (something like
        # StopPipeline) as a signal to stop processing this pipeline.
        return {
            'mimetype': 'unknown',
            'text': "",
            'file_metadata': {},
            'language': ""
        }

    text, forced_decoding = trial_decode(text)

    if isinstance(text, unicode):
        # HTMLParser only handles unicode objects. We can't pass the text
        # through it if we don't know the encoding, and it's possible we
        # also shouldn't. There's no way of knowing if it's a badly encoded
        # HTML page or a binary blob that happens to have bytes that look
        # like HTML entities.
        text = HTMLParser().unescape(text)

    text = clean(text)

    if isinstance(text, unicode):
        language = cld.detect(text.encode('utf-8'))[1]
    else:
        language = cld.detect(text)[1]

    return {
        'text': text,
        'file_metadata': metadata,
        'language': language,
        'mimetype': file_mime_type,
        'forced_decoding': forced_decoding
    }
def clean(self, bid=None):
    super(Donation, self).clean()
    if self.domain == 'LOCAL':  # local donations are always complete, duh
        if not self.donor:
            raise ValidationError('Local donations must have a donor')
        self.transactionstate = 'COMPLETED'
    if not self.donor and self.transactionstate != 'PENDING':
        raise ValidationError('Donation must have a donor when in a non-pending state')
    if not self.domainId and self.donor and self.timereceived:
        self.domainId = str(calendar.timegm(self.timereceived.timetuple())) + self.donor.email
    bids = set()
    if bid:
        bids |= set([bid])
    bids |= set(self.bids.all())
    bids = map(lambda b: b.amount, bids)
    bidtotal = reduce(lambda a, b: a + b, bids, Decimal('0'))
    if self.amount and bidtotal > self.amount:
        raise ValidationError('Bid total is greater than donation amount: %s > %s' % (bidtotal, self.amount))
    tickets = self.tickets.all()
    ticketTotal = reduce(lambda a, b: a + b, map(lambda b: b.amount, tickets), Decimal('0'))
    if self.amount and ticketTotal > self.amount:
        raise ValidationError('Prize ticket total is greater than donation amount: %s > %s' % (ticketTotal, self.amount))
    if self.comment and cld:
        if self.commentlanguage == 'un' or self.commentlanguage is None:
            detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
                self.comment.encode('utf-8'), hintLanguageCode='en')
            if detectedLangCode in map(lambda x: x[0], LanguageChoices):
                self.commentlanguage = detectedLangCode
            else:
                self.commentlanguage = 'un'
    else:
        self.commentlanguage = 'un'
def download_article_file(articleURL, articleFileDirectory, code):
    articleFilePath = articleFileDirectory + code

    # Download the article and save as file
    if articleURL == "":
        print "ERROR: Empty URL detected! File not created"
        return None
    else:
        # If a directory for files doesn't exist, create it
        dir = os.path.dirname(articleFileDirectory)
        if not os.path.isdir(dir):
            #print "Created directory: " + dir
            os.makedirs(dir)

        try:
            #fullArticle = urllib2.urlopen(articleURL)
            #fullArticleText = fullArticle.read()

            # Use boilerpipe to remove boilerplate and formatting
            extractor = Extractor(extractor='ArticleExtractor', url=articleURL)
            fullArticleText = extractor.getText()

            # Test to see if article is in English. If not, then return None
            top_language = cld.detect(fullArticleText.encode('utf-8'))[0]
            if top_language != 'ENGLISH':
                print "SKIPPED: Article is in " + top_language
                return None

            outfile = open(articleFilePath, 'w+')
            outfile.write(fullArticleText.encode('ascii', 'ignore'))
            outfile.close()

            # Use lxml's HTML cleaner to remove markup
            #htmltree = lxml.html.fromstring(fullArticleText)
            #cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=True)
            #cleaned_tree = cleaner.clean_html(htmltree)
            #return cleaned_tree.text_content()
            return fullArticleText

        except urllib2.HTTPError:
            print "ERROR: HTTPError. Article file download skipped: " + articleURL
            return None
        except urllib2.URLError:
            print "ERROR: URLError. Article file download skipped: " + articleURL
            return None
        except LookupError:
            print "ERROR: LookupError. Article file download skipped: " + articleURL
            return None
        except UnicodeDecodeError:
            print "ERROR: UnicodeDecodeError. Article file download skipped: " + articleURL
            return None
        except:
            print "ERROR: ", sys.exc_info()[0]
            return None
def check_german(tweet_text):
    emoji_key = pandas.read_csv('DATA/emoji_table.txt', encoding='utf-8', index_col=0)
    emoji_key['count'] = 0
    emoji_dict = emoji_key['count'].to_dict()
    emoji_dict_total = emoji_key['count'].to_dict()
    emoji_list = emoji_dict.keys()

    tweet_text = unicode(tweet_text, 'utf-8')
    tweet_text = tweet_text.encode('utf-8')
    tokens = tweet_text.split(' ')
    new_text = ''

    # delete @username
    for token in tokens:
        if '@' not in token:
            new_text += token + ' '
    new_text = new_text.lower()

    text = unicode(new_text, 'utf-8')
    text = text.encode('utf-8')
    top_language_name = cld.detect(text)
    lang_form_langid = langid.classify(text)

    if new_text == '':
        return True  # if text is empty - treat as German
    if top_language_name[0] == 'GERMAN' or lang_form_langid[0] == 'de':
        return True
    else:
        return False
def langDetect(s):
    import cld
    langsSeen = set()
    detLangsSeen = set()
    detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
        s, pickSummaryLanguage=True, removeWeakMatches=False)
    if DEBUG:
        log('CLD :')
        log('  detected: %s' % detectedLangName)
        log('  reliable: %s' % (isReliable != 0))
        log('  textBytes: %s' % textBytesFound)
        log('  details: %s' % str(details))
        for tup in details:
            detLangsSeen.add(tup[0])
        log('  %d langs; %d ever detected' % (len(langsSeen), len(detLangsSeen)))
        log("\n")
    if detectedLangName == 'Unknown':
        return 'Unknown'
    else:
        return [i[1] for i in details]
def runOne(self, expectedLangName, s, shouldBeReliable=True):
    if VERBOSE:
        print
        print 'Test: %s [%d bytes]' % (expectedLangName, len(s))
    detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
        s, pickSummaryLanguage=True, removeWeakMatches=False)
    if VERBOSE:
        print '  detected: %s' % detectedLangName
        print '  reliable: %s' % (isReliable != 0)
        print '  textBytes: %s' % textBytesFound
        print '  details: %s' % str(details)
    self.langsSeen.add(expectedLangName)
    for tup in details:
        self.detLangsSeen.add(tup[0])
    print '  %d langs; %d ever detected' % (len(self.langsSeen), len(self.detLangsSeen))
    if False:
        if expectedLangName == 'YIDDISH':
            l = list(self.detLangsSeen)
            l.sort()
            for i, name in enumerate(l):
                print '  PyTuple_SET_ITEM(pyDetLangs, %d, PyString_FromString("%s"));' % (i, name)
    self.assertEquals(
        expectedLangName, detectedLangName,
        '%s != %s; details: %s' % (detectedLangName, expectedLangName, str(details)))
    self.assertTrue(not shouldBeReliable or isReliable)
def check_german(tweet_text):
    if isinstance(tweet_text, unicode) is False:
        tweet_text = unicode(tweet_text, 'utf-8')
    tweet_text = tweet_text.encode('utf-8')
    tokens = tweet_text.split(' ')
    new_text = ''

    # delete @username
    for token in tokens:
        if '@' not in token:
            new_text += token + ' '
    new_text = new_text.lower()

    if isinstance(new_text, unicode) is False:
        text = unicode(new_text, 'utf-8')
        text = text.encode('utf-8')
    else:
        text = new_text.encode('utf-8')

    top_language_name = cld.detect(text)
    lang_form_langid = langid.classify(text)

    if new_text == '':
        return True  # if text empty - german
    if top_language_name[0] == 'GERMAN' or lang_form_langid[0] == 'de':
        return True
    else:
        return False
def spellcheck(self, text, tld=''):
    from scanner.models import BadWord

    # guess language code
    self.log.debug(' * guessing language...')
    #lang_code, lang_num, lang_name = guess_language.guessLanguageInfo(text)
    lang_name, lang_code, reliable, bytes_found, details = \
        cld.detect(text.encode('utf-8'), hintTopLevelDomain=tld)
    self.log.debug(' -> detected lang: %s (%s)' % (lang_name, lang_code))
    if lang_code.upper() == 'UNKNOWN' or lang_name.upper() == 'UNKNOWN' or not reliable:
        self.log.warning(' -> Cannot detect language of page - end : %s' % details)
        return None, set()

    self.log.debug(' * searching for dictionary')
    try:
        checker = enchant.checker.SpellChecker(
            lang_code,
            filters=[
                EmailFilter,
                URLFilter,
                # BetterURLFilter,
            ])
    except enchant.DictNotFoundError:
        if lang_code in self.not_supported_lang:
            self.log.debug(
                " -> Cannot find language for spellchecker for %s - end (blacklisted)" % lang_code)
        else:
            self.log.error(
                " -> Cannot find language for spellchecker for %s - end" % lang_code)
        return None, set()

    # checking page for bad words
    self.log.debug(' * check spelling...')
    checker.set_text(text)
    self.log.debug(' -> ok')

    self.log.debug(' * get errors...')
    errors = [er.word for er in checker if len(er.word) < 128]
    self.log.debug(' -> ok')

    self.log.debug(' * found %d bad words and adding them to DB' % len(errors))
    BadWord.objects.bulk_create(
        [BadWord(word=bad_word.strip().lower()) for bad_word in errors])
    self.log.debug(' -> ok')

    self.log.debug(' * call filtering bad words')
    errors = BadWord.filter_bad_words(errors)
    self.log.debug(' -> ok')

    self.log.debug(' * after filtering out there is %d errors (%s)' % (len(errors), errors))
    return lang_name, set(errors)
def detect_language(text):
    """
    Detect the language of text using chromium_compact_language_detector

    :param text: text to be analyzed
    :return: e.g. {"name": "PORTUGUESE", "code": "pt"}
    """
    name, code, isReliable, textBytesFound, details = cld.detect(text.encode('utf8'))
    return {"name": name, "code": code}
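# Hedged usage sketch (illustration only, not from the original source),
# assuming detect_language() above is in scope and that the `cld` binding
# reports upper-case English language names, as the other examples here do.
def _demo_detect_language():
    result = detect_language(u"Ola, tudo bem?")
    # Expected shape: {"name": "PORTUGUESE", "code": "pt"}
    return result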
def filter(self, json_tweet_string):
    tweet = json.loads(json_tweet_string)
    # CLD expects a bytestring encoded as UTF-8, and not a unicode string
    tweet_text = codecs.encode(tweet['text'], 'utf-8')
    # Per the CLD docs, "isReliable is True if the top language is much better than 2nd best language."
    topLanguageName, topLanguageCode, isReliable, textBytesFound, details = cld.detect(tweet_text)
    if topLanguageName == "ENGLISH" and isReliable:
        return True
    else:
        return False
def __init__(self, status):
    self.status = status
    self.cld_result = None
    try:
        # topLanguageName, topLanguageCode, isReliable, textBytesFound, details
        self.cld_result = cld.detect(status.text.encode("ascii", "ignore"),
                                     isPlainText=True,
                                     includeExtendedLanguages=False)
    except UnicodeEncodeError as e:
        log.warn("language detection failed on %s" % repr(status.text))
def process(self, file_data):
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        file_mime_type = m.id_buffer(file_data['contents'])
    metadata = {}
    if file_mime_type == 'text/plain':
        text = file_data['contents']
    elif file_mime_type == 'text/html':
        text = parse_html(file_data['contents'], True, ['script', 'style'])
    elif file_mime_type == 'application/pdf':
        text, metadata = extract_pdf(file_data['contents'])
    else:
        # If we can't detect the mimetype we add a flag that can be read by
        # the frontend to provide more information on why the document
        # wasn't processed.
        # XXX: We're returning an empty text because if we don't the
        # pipeline will run indefinitely. The right approach is to make
        # pypelinin understand a specific exception (something like
        # StopPipeline) as a signal to stop processing this pipeline.
        return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""}

    text, forced_decoding = trial_decode(text)

    if isinstance(text, unicode):
        # HTMLParser only handles unicode objects. We can't pass the text
        # through it if we don't know the encoding, and it's possible we
        # also shouldn't. There's no way of knowing if it's a badly encoded
        # HTML page or a binary blob that happens to have bytes that look
        # like HTML entities.
        text = HTMLParser().unescape(text)

    text = clean(text)

    if isinstance(text, unicode):
        language = cld.detect(text.encode('utf-8'))[1]
    else:
        language = cld.detect(text)[1]

    return {'text': text,
            'file_metadata': metadata,
            'language': language,
            'mimetype': file_mime_type,
            'forced_decoding': forced_decoding}
def _language(si, context):
    if si.body and si.body.raw:
        name, code, is_reliable, num_text_bytes, details = cld.detect(si.body.raw)
        if is_reliable and code != "xxx":
            si.body.language = Language(code=code, name=name)
        else:
            si.body.language = Language(code="", name="")
    elif si.body:
        ## no .body.raw -- rare, but not impossible
        si.body.language = Language(code="", name="")
    return si
def runOne(self, expectedLangName, s):
    if VERBOSE:
        print
        print 'Test: %s [%d bytes]' % (expectedLangName, len(s))
    detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(s)
    if VERBOSE:
        print '  detected: %s' % detectedLangName
        print '  reliable: %s' % (isReliable != 0)
        print '  textBytes: %s' % textBytesFound
        print '  details: %s' % str(details)
    self.langsSeen.add(expectedLangName)
    print '  %d langs' % len(self.langsSeen)
    self.assertEquals(expectedLangName, detectedLangName)
    self.assertTrue(isReliable)
def process_line(tweet_line):
    line = tweet_line.rstrip()
    tweet_array = [splits for splits in line.split("\t") if splits != ""]
    tweet_id = tweet_array[0]
    # Sun Jan 23 00:04:13 +0000 2011
    tweet_date = tweet_array[1]
    tweet_content = tweet_array[4].lower()
    # apply cld to find out the language of the tweet
    tweet_lang = cld.detect(tweet_content)[0]
    if tweet_lang == 'ENGLISH':
        process_english_tweet(tweet_content, tweet_date, tweet_id)
    return line
def get_desc_from_folder(folder_path, desc_count=1000):
    name_desc_pairs = {}
    count = desc_count
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".json"):
                if len(name_desc_pairs) < count:
                    # FIXME: unicode error \xc3\xa2\xc2\x99\xc2\xa3
                    desc = get_desc(os.path.join(root, file))
                    desc_utf8 = desc.encode('utf-8')
                    if len(desc) > 1000:
                        lang = cld.detect(desc_utf8)
                        if lang[1] == 'en' and len(lang[4]) == 1:
                            name_desc_pairs[file] = desc
    return name_desc_pairs
def handle_tweet(tweet):
    if 'langid_guess' not in tweet:
        tweet['langid_guess'] = langid.classify(tweet['text'])[0]
    if 'ldig_guess' not in tweet:
        tweet['ldig_guess'] = det.detect('model.latin', tweet['text'])[1]
    cld_guess = cld.detect(tweet['text'].encode('utf-8'))[1]
    guesses = {'langid_guess': tweet['langid_guess'],
               'ldig_guess': tweet['ldig_guess'],
               'twitter_guess': tweet['lang'],
               'cld_guess': cld_guess}
    del tweet['langid_guess']
    del tweet['ldig_guess']
    del tweet['lang']
    tweet['langs'] = guesses
    return tweet
def chromium_cld(page):
    '''
    Run the Chromium Compact Language Detector on the given page.
    '''
    # Python binding to C++
    data = cld.detect(page)
    if data[4] == []:
        # this could happen when CLD returns 'unknown'
        result = {'und': 1}
    else:
        result = dict([(isoify(l[1]), l[2] / 100.0) for l in data[4]])
    return {
        'data': data,
        'result': result,
        'best': isoify(data[1]),
        'best_name': iso639_3_index[isoify(data[1])]['ref_name']
    }
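# Hedged usage sketch (illustration only, not from the original source). It
# assumes the surrounding module's `isoify` and `iso639_3_index` helpers are
# available and that `page` is a UTF-8 bytestring, as chromium_cld() above expects.
def _demo_chromium_cld(page):
    detected = chromium_cld(page)
    best_code = detected['best']   # ISO code of the top language
    scores = detected['result']    # mapping of language code -> share of text in [0, 1]
    return best_code, scores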
def process_tweet_xml_elm(tweet_xml_elm):
    tweet_text = etree.tostring(tweet_xml_elm.find("tweet_text"), method='text', encoding="UTF-8")
    tweet_text = remove_unicode_chr(smart_unicode(tweet_text))
    tweet_id = tweet_xml_elm.find("tweet_id").text
    timestamp = tweet_xml_elm.find("timestamp").text
    created_at = tweet_xml_elm.find("created_at").text
    user_id = tweet_xml_elm.find("user_id").text
    langName, lang, isReliable, textBytesFound, details = cld.detect(
        tweet_text, pickSummaryLanguage=True, removeWeakMatches=False)
    #STRIP_NON_EN:
    if (STRIP_NON_EN and lang != 'en') or (STRIP_RT and tweet_text.upper().startswith("RT")):
        return lang, None
    return lang, Tweet(tweet_id, user_id, timestamp, created_at, tweet_text)
def process(input_file):
    count = 0
    reader = unicode_csv_reader(open(input_file))
    for status in reader:
        count = count + 1
        status_text = status[_FIELDS.index("text")]
        tb = TextBlob(status_text)
        lang_detected = cld.detect(status_text.encode("utf-8"))
        # consider only english tweets
        if lang_detected[0] == "ENGLISH":
            sentiment = decode_sentiment(tb.sentiment[0])
            print str(count), sentiment[1], " - ", status_text.encode("utf-8")
            try:
                csvwriter.writerow([
                    status[_FIELDS.index("id")],
                    status[_FIELDS.index("created_at")],
                    status[_FIELDS.index("user_id")],
                    status[_FIELDS.index("user_screen_name")],
                    status[_FIELDS.index("user_name")].encode("utf-8"),
                    status[_FIELDS.index("user_description")].encode("utf-8"),
                    status[_FIELDS.index("user_created_at")],
                    status[_FIELDS.index("user_followers_count")],
                    status[_FIELDS.index("user_friends_count")],
                    status[_FIELDS.index("user_statuses_count")],
                    status[_FIELDS.index("user_listed_count")],
                    status[_FIELDS.index("text")].encode("utf-8"),
                    status[_FIELDS.index("source")].encode("utf-8"),
                    status[_FIELDS.index("retweet_count")],
                    status[_FIELDS.index("place_id")],
                    status[_FIELDS.index("place_type")],
                    status[_FIELDS.index("place_country_code")],
                    status[_FIELDS.index("place_country")].encode("utf-8"),
                    status[_FIELDS.index("place_name")].encode("utf-8"),
                    status[_FIELDS.index("place_fullname")].encode("utf-8"),
                    sentiment[1],
                    sentiment[0]
                ])
            except Exception as e:
                print "Exception ", e
def handleSingleLangdet_cld(cur, articleId, text, commit=True):
    """
    Like handleSingleLangdet(), but using Google's CLD library.
    If `commit` is given, commits the transaction at the end.
    """
    try:
        lc_alt = cld.detect(text.encode('utf8', 'replace'))[1]
    except:
        print repr(text)
        raise
    if lc_alt == 'un' or lc_alt == 'xxx':
        # "un" means "unknown"
        lc_alt = lc_iso = None
    else:
        lc_iso = iso_map.iso2to3[lc_alt.split('-')[0]]
    debug('Article %d has language (%s / %s)', articleId, lc_iso, lc_alt)
    cur.execute(
        "UPDATE feed_article_meta SET lang_iso=%s, lang_altcode=%s, lang_is_cld=%s WHERE id=%s",
        (lc_iso, lc_alt, True, articleId))
    if commit:
        cur.connection.commit()
    return (lc_iso, lc_alt)
def __call__(self, si, context):
    if si.body and si.body.raw:
        name, code, is_reliable, num_text_bytes, details = cld.detect(si.body.raw)
        if is_reliable and code != 'xxx':
            si.body.language = Language(code=code, name=name)
        else:
            si.body.language = Language(code='', name='')
    elif si.body:
        ## no .body.raw -- rare, but not impossible
        si.body.language = Language(code='', name='')
    if 'force' in self.config:
        si.body.language = Language(
            code=self.config['force'].get('code'),
            name=self.config['force'].get('name'))
    return si
def __call__(self, si, context):
    if si.body and si.body.raw:
        name, code, is_reliable, num_text_bytes, details = cld.detect(si.body.raw)
        if is_reliable and code != 'xxx':
            si.body.language = Language(code=code, name=name)
        else:
            si.body.language = Language(code='', name='')
    elif si.body:
        ## no .body.raw -- rare, but not impossible
        si.body.language = Language(code='', name='')
    if 'force' in self.config:
        si.body.language = Language(code=self.config['force'].get('code'),
                                    name=self.config['force'].get('name'))
    return si
def handleSingleLangdet_cld(cur, articleId, text, commit=True):
    """
    Like handleSingleLangdet(), but using Google's CLD library.
    If `commit` is given, commits the transaction at the end.
    """
    try:
        lc_alt = cld.detect(text.encode('utf8', 'replace'))[1]
    except:
        print repr(text)
        raise
    if lc_alt == 'un' or lc_alt == 'xxx':
        # "un" means "unknown"
        lc_alt = lc_iso = None
    else:
        lc_iso = iso_map.iso2to3[lc_alt.split('-')[0]]
    debug('Article %d has language (%s / %s)', articleId, lc_iso, lc_alt)
    cur.execute(
        "UPDATE feed_article_meta SET lang_iso=%s, lang_altcode=%s, lang_is_cld=%s WHERE id=%s",
        (lc_iso, lc_alt, True, articleId))
    if commit:
        cur.connection.commit()
    return (lc_iso, lc_alt)
def clean(self, bid=None):
    super(Donation, self).clean()
    if self.domain == 'LOCAL':  # local donations are always complete, duh
        if not self.donor:
            raise ValidationError('Local donations must have a donor')
        self.transactionstate = 'COMPLETED'
    if not self.donor and self.transactionstate != 'PENDING':
        raise ValidationError('Donation must have a donor when in a non-pending state')
    if not self.domainId and self.donor and self.timereceived:
        self.domainId = str(calendar.timegm(self.timereceived.timetuple())) + self.donor.email
    bids = set(self.bids.all())
    # because non-saved bids will not have an id, they are not hashable, so we have to special case them
    if bid:
        if not bid.id:
            bids = list(bids) + [bid]
        else:
            # N.B. the order here is very important, as we want the new copy of bid to override the old one (if present)
            bids = list(set([bid]) | bids)
    bids = map(lambda b: b.amount, bids)
    bidtotal = reduce(lambda a, b: a + b, bids, Decimal('0'))
    if self.amount and bidtotal > self.amount:
        raise ValidationError('Bid total is greater than donation amount: %s > %s' % (bidtotal, self.amount))
    tickets = self.tickets.all()
    ticketTotal = reduce(lambda a, b: a + b, map(lambda b: b.amount, tickets), Decimal('0'))
    if self.amount and ticketTotal > self.amount:
        raise ValidationError('Prize ticket total is greater than donation amount: %s > %s' % (ticketTotal, self.amount))
    if self.comment and cld:
        if self.commentlanguage == 'un' or self.commentlanguage is None:
            detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
                self.comment.encode('utf-8'), hintLanguageCode='en')
            if detectedLangCode in map(lambda x: x[0], LanguageChoices):
                self.commentlanguage = detectedLangCode
            else:
                self.commentlanguage = 'un'
    else:
        self.commentlanguage = 'un'
def process_tweet_xml_elm(tweet_xml_elm):
    tweet_text = etree.tostring(tweet_xml_elm.find("tweet_text"), method='text', encoding="UTF-8")
    tweet_text = remove_unicode_chr(smart_unicode(tweet_text))
    tweet_id = tweet_xml_elm.find("tweet_id").text
    timestamp = tweet_xml_elm.find("timestamp").text
    created_at = tweet_xml_elm.find("created_at").text
    user_id = tweet_xml_elm.find("user_id").text
    langName, lang, isReliable, textBytesFound, details = cld.detect(
        tweet_text, pickSummaryLanguage=True, removeWeakMatches=False)
    #STRIP_NON_EN:
    if (STRIP_NON_EN and lang != 'en') or (STRIP_RT and tweet_text.upper().startswith("RT")):
        return lang, None
    return lang, Tweet(tweet_id, user_id, timestamp, created_at, tweet_text)
def langDetect(s):
    import cld
    langsSeen = set()
    detLangsSeen = set()
    detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
        s, pickSummaryLanguage=True, removeWeakMatches=False)
    if DEBUG:
        log('CLD :')
        log('  detected: %s' % detectedLangName)
        log('  reliable: %s' % (isReliable != 0))
        log('  textBytes: %s' % textBytesFound)
        log('  details: %s' % str(details))
        for tup in details:
            detLangsSeen.add(tup[0])
        log('  %d langs; %d ever detected' % (len(langsSeen), len(detLangsSeen)))
        log("\n")
    if detectedLangName == 'Unknown':
        return 'Unknown'
    else:
        return [i[1] for i in details]
def filter(reader, out, meta_out):
    global counter_all
    counter_fi = 0
    counter = 0
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    #reader.sync(4042)
    position = reader.getPosition()
    while reader.next(key, value):
        k = key.toString()
        v = value.toString()
        position = reader.getPosition()
        #lang, w = langid.classify(v[:312])
        lang = cld.detect(v[:400])
        #print lang
        #if omor.is_fin_string(v):
        #pdb.set_trace()
        counter += 1
        if lang[0] == 'FINNISH':
            counter_fi += 1
            out.write(magicStringB)
            out.write(k + '\n')
            out.write('<lang>\n' + str(lang) + '\n</lang>')
            out.write("\n")
            out.write(v)
            out.write(magicStringE)
            meta_out.write(k + '\n' + str(lang) + '\n')
    counter_all += counter
    print >> sys.stderr, "Global counter", counter_all
    print >> sys.stderr, "Fin / All", counter_fi, counter
def runOne(self, expectedLangName, s, shouldBeReliable=True):
    if VERBOSE:
        print
        print 'Test: %s [%d bytes]' % (expectedLangName, len(s))
    detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
        s, pickSummaryLanguage=True, removeWeakMatches=False)
    if VERBOSE:
        print '  detected: %s' % detectedLangName
        print '  reliable: %s' % (isReliable != 0)
        print '  textBytes: %s' % textBytesFound
        print '  details: %s' % str(details)
    self.langsSeen.add(expectedLangName)
    for tup in details:
        self.detLangsSeen.add(tup[0])
    print '  %d langs; %d ever detected' % (len(self.langsSeen), len(self.detLangsSeen))
    if False:
        if expectedLangName == 'YIDDISH':
            l = list(self.detLangsSeen)
            l.sort()
            for i, name in enumerate(l):
                print '  PyTuple_SET_ITEM(pyDetLangs, %d, PyString_FromString("%s"));' % (i, name)
    self.assertEquals(expectedLangName, detectedLangName,
                      '%s != %s; details: %s' % (detectedLangName, expectedLangName, str(details)))
    self.assertTrue(not shouldBeReliable or isReliable)
import cld
import os
from fnmatch import fnmatch
import sys

# open the temp file java generated.
with open(sys.argv[1], "r") as myfile:
    data = myfile.read().replace('\n', '')

detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
    str(data), pickSummaryLanguage=False, removeWeakMatches=False)
print ' lang: %s ,reliable: %i' % (detectedLangCode, isReliable)
def detect_articles_language():
    for x in Article.objects.all().iterator():
        x.language = cld.detect(x.excerpt.encode('ascii', 'ignore'))[1]
        x.save()
aids = defaultdict(list)  # lang group -> list of article ids.
done_hosts = set()

for lg in LANG_GROUPS:
    f = open('./evaluation/compare_%s.html' % lg, 'w')
    f.write('<style>table{table-layout:fixed}td{vertical-align:top;border:solid #ccc 1px;border-top:solid black 2px;}.tiny{font-size:70%; width:45%}</style>')
    f.write('<table><tr><th>Article info<th>diff(old Tomaz, new Tomaz)<th class="tiny">diff(Mitja, new Tomaz)')
    f.close()

for i, row in enumerate(cur):
    url = row['url']
    host = url.split('/')[2]
    if host in done_hosts:
        continue
    done_hosts.add(host)
    print i, row['id'], host

    lang = cld.detect(row['content'])[1].split('-')[0]
    if lang == 'un':
        continue
    elif lang == 'en':
        lang_group = 'english'
    elif lang in 'ja ko zh hi ms ml te ta jw oc ur gu th kn pa fa km'.split():
        lang_group = 'syllabary'
    else:
        lang_group = 'alphabet'

    if len(aids[lang_group]) >= 50:
        continue
    aids[lang_group].append(row['id'])

    try:
        html = str(row['content']).decode('utf8')
    except:
        continue
from stools.wordcloud.wordcloud import make_wordcloud

folder_path = "data_google_play"
json_names = [['extendedInfo', 'description']]

if not os.path.exists(folder_path):
    call(["git", "clone", "https://github.com/sangheestyle/data_google_play.git"])

json_contents, file_names = stio.read_json_folder(folder_path, '.json', json_names)
descriptions = zip(*json_contents)[0]

# The number of input files is 1365, but the number of results in apk_info
# is 971 due to filtering by length of description and language (English only)
apk_info = []
for idx, desc in enumerate(descriptions):
    if len(desc) > 1000:
        lang = cld.detect(desc.encode('utf-8'))
        if lang[1] == 'en' and len(lang[4]) == 1:
            apk_info.append([file_names[idx], desc.encode('ascii', errors='ignore')])

filtered_desc = zip(*apk_info)[1]
stemmed_list = nlp.trs(filtered_desc, "snowball")
dictionary = nlp.dictionary(stemmed_list)
corpus = nlp.corpus(stemmed_list, dictionary)
corpus_lda = nlp.lda(corpus, dictionary, num_topics=5)

vectors = [array(f) for f in corpus_lda]
prediction = ml.KMeans(vectors, n_clusters=5, max_iter=1000)

for idx, item in enumerate(apk_info):
    apk_name = item[0]
    stemmed_content = ' '.join(stemmed_list[idx])
    groupID = prediction[idx]
import cld

conn, cur = openConnection()
cur = conn.cursor('x')
cur.execute(
    "SELECT m.id, p.content, m.lang_altcode FROM processed_article p JOIN feed_article_meta m ON (p.feed_articleid = m.id) WHERE p.mode='cleartext' ORDER BY m.id DESC LIMIT 100000"
)

cnt = {}
cnt2 = {}
while True:
    row = cur.fetchone()
    if not row:
        break
    aid, txt, lang = row
    lang = str(lang[:2])
    lang2 = cld.detect(txt.encode('utf8', 'ignore'))[1]
    cnt[lang] = cnt.get(lang, 0) + 1
    cnt2[lang2] = cnt2.get(lang2, 0) + 1
    print 'done', sum(cnt.itervalues())
print 'done'

def top(d, n=60):
    for pair in sorted(d.iteritems(), key=lambda pair: -pair[1])[:n]:
        print '%s %5d' % pair

print 'DATABASE SAYS:'
top(cnt)
print '\nCLD SAYS:'
def update_all(self):
    for course in Course.objects.filter(language__iexact="english"):
        topLanguageName, topLanguageCode, isReliable, textBytesFound, details = cld.detect(
            course.title.encode('utf-8') + " " + course.description.encode('utf-8'))
        if topLanguageCode not in ['en', 'un']:
            # force unidentified languages into English
            if topLanguageCode == 'un':
                topLanguageName = 'English'
            language = topLanguageName.lower().capitalize()
            if language in LANG_MAPPING:
                course.language = language
                course.save()
                continue
            else:
                self.stdout.write("New language %s %s (reliable: %s)" % (topLanguageName, topLanguageCode, isReliable))
def detect(self, utf8):
    name, code, reliable, numBytes, details = cld.detect(utf8,
                                                         isPlainText=True,
                                                         removeWeakMatches=False,
                                                         pickSummaryLanguage=False)
    for tup in details:
        self.allLangs.add(tup[0])
    return code, reliable
def get_language(text):
    """Returns the language of a given text as a tuple like (LANGUAGE, language-code)"""
    return cld.detect(text)[:2]
inputDB = "myDB"
connector = sqlite3.connect(inputDB)
selecter = connector.cursor()

##########
# ALTER TABLE FOR A COLUMN FOR THE LANGUAGE
selecter.execute('''ALTER TABLE myDB ADD COLUMN "description_abbreviation_chrome" 'TEXT' ''')
selecter.execute('''ALTER TABLE myDB ADD COLUMN "description_language_chrome" 'TEXT' ''')
selecter.execute('''ALTER TABLE myDB ADD COLUMN "description_reliability_chrome" 'TEXT' ''')
##########

selecter.execute('''SELECT id_db, description FROM myDB''')

for row in selecter:
    #print(row)
    a = ''.join(row[1])

    # chrome language detection
    goog = detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect("{}".format(a))
    lancode = detectedLangCode
    lanname = detectedLangName
    lanreliability = isReliable != 0
    landetails = details
    #print("lancode: ", lancode)
    #print("lanname: ", lanname)
    #print("lanreliability: ", lanreliability)
    print(row[0])
    #print(a)

    connector.execute('''update myDB set description_abbreviation_chrome = ? , description_language_chrome = ? , description_reliability_chrome = ? where id_db == ?''',
                      (lancode, lanname, lanreliability, row[0]))

connector.commit()  # save changes
connector.close()
def get_full_article(this_item, feed_id, **kwargs):
    if len(this_item['link']) > 200:
        this_item['link'] = short.shorten(this_item['link'])['url']
    if any(required not in this_item for required in ['title', 'link']):
        return
    try:
        Article.objects.values('id').get(
            Q(feed_id=feed_id, url=this_item['link']) |
            Q(feed_id=feed_id, title=this_item['title']))
        return
    except Article.DoesNotExist:
        pass
    except Article.MultipleObjectsReturned:
        return

    published_parsed = datetime.utcnow().replace(tzinfo=utc)
    if 'updated_parsed' not in this_item:
        if 'published_parsed' in this_item and datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(tzinfo=utc) < published_parsed:
            published_parsed = datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(tzinfo=utc)
    elif datetime.utcfromtimestamp(mktime(this_item['updated_parsed'])).replace(tzinfo=utc) < published_parsed:
        published_parsed = datetime.utcfromtimestamp(
            mktime(this_item['updated_parsed'])).replace(tzinfo=utc)

    if 'author' not in this_item:
        this_item['author'] = None
    if 'description' not in this_item:
        this_item['description'] = ''
    if len(this_item['title']) > 200:
        this_item['title'] = this_item['title'][:180] + '...'

    res = process_article(this_item['description'])
    this_item['excerpt'] = res['excerpt']
    this_item['word_count'] = res['word_count']
    this_item['description'] = res['content']
    media = res['image']
    full = None
    if not media:
        if 'media_content' in this_item and 'url' in this_item['media_content'][0]:
            media = this_item['media_content'][0]['url']
        else:
            full = get_article_readability(this_item)
        if full:
            res = full
            media = res['lead_image_url']

    if len(this_item['excerpt']) == 0:
        this_item['language'] = cld.detect(this_item['excerpt'].encode('ascii', 'ignore'))[1]
        if this_item['language'] == 'un':
            this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]
    else:
        this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]

    if kwargs.get('summarize_excerpt'):
        extend = {'content_ex': None, 'summary': None}
        try:
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = tmp.name
                tmp.write((strip_tags(res['content'].decode('ascii', 'ignore'))).strip())
                tmp.flush()
                extend['summary'] = subprocess.check_output(
                    ['ots', tmp_path]).strip().splitlines().pop().strip()
        except Exception:
            pass
    else:
        extend = extend_article(full, this_item['link'], **kwargs)

    obj, created = Article.objects.get_or_create(
        feed_id=feed_id,
        url=this_item['link'],
        defaults={
            'title': this_item['title'],
            'content': this_item['description'],
            'word_count': res['word_count'],
            'url': this_item['link'],
            'media': media,
            'date_parsed': published_parsed,
            'author': this_item['author'],
            'excerpt': this_item['excerpt'],
            'language': this_item['language'],
            'summary': extend['summary'],
            'content_ex': extend['content_ex']
        })
    if created:
        get_article_info(obj)
annotations_table, spotlight_table = sql_convenience.create_all_tables(args.keyword)
tweets = tweet_generators.get_tweets(open(args.tweet_file))

# we can skip through Tweets we've already seen in the same file by
# specifying a tweet id to jump to
if args.skipto is not None:
    for tweet in tweets:
        if tweet['id'] == args.skipto:
            break  # continue after this tweet

for tweet in tweets:
    tweet_text = unicode(tweet['text'])
    annotate = True
    # determine if this is an English tweet or not
    tweet_text_bytesutf8 = tweet_text.encode('utf-8')
    language_name, language_code, is_reliable, text_bytes_found, details = cld.detect(tweet_text_bytesutf8)
    # example: ('SPANISH', 'es', True, 69, [('SPANISH', 'es', 100, 93.45794392523365)])
    print("---")
    print(language_name, language_code, is_reliable)
    if language_code not in set(["en", "un"]):
        annotate = False
    tweet_id = tweet['id']
    if sql_convenience.check_if_tweet_exists(tweet_id, annotations_table) == 0:
        # check our keyword is present as Twitter can provide tweets 'relevant
        # to your keyword' which don't actually contain the keyword (but it
        # might be linked in a t.co title or body text)
        nbr_keywords = tweet_text.lower().count(args.keyword)
        nbr_keywords_hash = tweet_text.lower().count("#" + args.keyword)
        print(nbr_keywords, nbr_keywords_hash)
        if nbr_keywords == nbr_keywords_hash:
def clean(self, bid=None):
    super(Donation, self).clean()
    if self.domain == 'LOCAL':  # local donations are always complete, duh
        if not self.donor:
            raise ValidationError('Local donations must have a donor')
        self.transactionstate = 'COMPLETED'
    if not self.donor and self.transactionstate != 'PENDING':
        raise ValidationError(
            'Donation must have a donor when in a non-pending state')
    if not self.domainId and self.donor and self.timereceived:
        self.domainId = str(calendar.timegm(
            self.timereceived.timetuple())) + self.donor.email
    bids = set(self.bids.all())
    # because non-saved bids will not have an id, they are not hashable, so we have to special case them
    if bid:
        if not bid.id:
            bids = list(bids) + [bid]
        else:
            # N.B. the order here is very important, as we want the new copy of bid to override the old one (if present)
            bids = list(set([bid]) | bids)
    bids = map(lambda b: b.amount, bids)
    bidtotal = reduce(lambda a, b: a + b, bids, Decimal('0'))
    if self.amount and bidtotal > self.amount:
        raise ValidationError(
            'Bid total is greater than donation amount: %s > %s' % (bidtotal, self.amount))
    tickets = self.tickets.all()
    ticketTotal = reduce(lambda a, b: a + b,
                         map(lambda b: b.amount, tickets), Decimal('0'))
    if self.amount and ticketTotal > self.amount:
        raise ValidationError(
            'Prize ticket total is greater than donation amount: %s > %s' % (ticketTotal, self.amount))
    if self.comment and cld:
        if self.commentlanguage == 'un' or self.commentlanguage is None:
            detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
                self.comment.encode('utf-8'), hintLanguageCode='en')
            if detectedLangCode in map(lambda x: x[0], LanguageChoices):
                self.commentlanguage = detectedLangCode
            else:
                self.commentlanguage = 'un'
    else:
        self.commentlanguage = 'un'
from collections import Counter
import cld, tweetstream

if __name__ == "__main__":
    langs = Counter()
    while True:
        try:
            with tweetstream.FilterStream("jcodameta", "T3stcct!", track=["MediaLabIO"]) as stream:
                for tweet in stream:
                    tweet_text = tweet.get("text")
                    try:
                        lang = cld.detect(tweet_text.encode('utf-8'))[0]
                        langs[lang] += 1
                        print langs, tweet_text
                    except:
                        print "error1"
                        continue
        except:
            print "error2"
            continue
import scraperwiki
import cld

text = []
text.append('hello over there, i am from Moscow')
text.append('Привет всем, я алкоголик из Москвы')
text.append('Hola a todos, soy un alcohólico de Moscú')

topLanguageName = []
lngCode = []
for x in range(len(text)):
    topLanguageName.append(cld.detect(text[x])[0])
    lngCode.append(cld.detect(text[x])[1])
    print topLanguageName[x]
import cld

text_it = "Wales lancia la Wikipedia delle news. Contro il fake in campo anche Google"
text_en = "Cassini Spacecraft Re-Establishes Contact After 'Dive' Between Saturn And Its Rings"

lang_it = cld.detect(text_it)
lang_en = cld.detect(text_en)

print(text_it, "is in", lang_it)
print(text_en, "is in", lang_en)
db = Connection(host=HOST_NAME,
                database="jb",
                user=MYSQL_USER_NAME,
                password=MYSQL_PASSWORD)

db2 = Connection(host=HOST_NAME,
                 database="jb_pure",
                 user="******",
                 password="******")

tweets = db.query("SELECT * FROM tweets")
count = 0

# we use a set to save tweets, and check against that to prevent duplicates
saved = set()

for t in tweets:
    c = t["tweet"].encode('utf-8')

    # language detection
    name, code, reliable, bytes_found, details = cld.detect(c)

    # compile a regex for urls. We don't want tweets with urls
    r = re.compile(r"(http://[^ ]+)")
    urlmatch = r.search(c)

    if (code == "en" or code == "un") and not urlmatch and c not in saved:
        # we allow 'unknown' languages into our database, as these are mostly short singlish sentences
        db2.execute("INSERT INTO tweets (user, tweet, location) VALUES (%s, %s, %s)",
                    t["user"], c, t["location"])
        saved.add(c)
    else:
        print "Not English: " + c + " lang:" + name
    count = count + 1

db.close()
db2.close()
def get_full_article(this_item, feed_id, **kwargs):
    if len(this_item['link']) > 200:
        this_item['link'] = short.shorten(this_item['link'])['url']
    if any(required not in this_item for required in ['title', 'link']):
        return
    try:
        Article.objects.values('id').get(
            Q(feed_id=feed_id, url=this_item['link']) |
            Q(feed_id=feed_id, title=this_item['title']))
        return
    except Article.DoesNotExist:
        pass
    except Article.MultipleObjectsReturned:
        return

    published_parsed = datetime.utcnow().replace(tzinfo=utc)
    if 'updated_parsed' not in this_item:
        if 'published_parsed' in this_item and datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(tzinfo=utc) < published_parsed:
            published_parsed = datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(tzinfo=utc)
    elif datetime.utcfromtimestamp(mktime(this_item['updated_parsed'])).replace(tzinfo=utc) < published_parsed:
        published_parsed = datetime.utcfromtimestamp(
            mktime(this_item['updated_parsed'])).replace(tzinfo=utc)

    if 'author' not in this_item:
        this_item['author'] = None
    if 'description' not in this_item:
        this_item['description'] = ''
    if len(this_item['title']) > 200:
        this_item['title'] = this_item['title'][:180] + '...'

    res = process_article(this_item['description'])
    this_item['excerpt'] = res['excerpt']
    this_item['word_count'] = res['word_count']
    this_item['description'] = res['content']
    media = res['image']
    full = None
    if not media:
        if 'media_content' in this_item and 'url' in this_item['media_content'][0]:
            media = this_item['media_content'][0]['url']
        else:
            full = get_article_readability(this_item)
        if full:
            res = full
            media = res['lead_image_url']

    if len(this_item['excerpt']) == 0:
        this_item['language'] = cld.detect(this_item['excerpt'].encode('ascii', 'ignore'))[1]
        if this_item['language'] == 'un':
            this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]
    else:
        this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]

    if kwargs.get('summarize_excerpt'):
        extend = {'content_ex': None, 'summary': None}
        try:
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = tmp.name
                tmp.write((strip_tags(res['content'].decode('ascii', 'ignore'))).strip())
                tmp.flush()
                extend['summary'] = subprocess.check_output(['ots', tmp_path]).strip().splitlines().pop().strip()
        except Exception:
            pass
    else:
        extend = extend_article(full, this_item['link'], **kwargs)

    obj, created = Article.objects.get_or_create(
        feed_id=feed_id,
        url=this_item['link'],
        defaults={'title': this_item['title'],
                  'content': this_item['description'],
                  'word_count': res['word_count'],
                  'url': this_item['link'],
                  'media': media,
                  'date_parsed': published_parsed,
                  'author': this_item['author'],
                  'excerpt': this_item['excerpt'],
                  'language': this_item['language'],
                  'summary': extend['summary'],
                  'content_ex': extend['content_ex']})
    if created:
        get_article_info(obj)