def main():
    s = sys.stdin.readlines()
    test_content = ""
    if len(s) < 10:
        test_content = " ".join(s)
    else:
        test_content = " ".join(s[:10])
    print langid.classify(test_content)[0]
def process_tu(self, tu, num_of_finished_scans):
    src_lang = langid.classify(tu.src_phrase)[0]
    trg_lang = langid.classify(tu.trg_phrase)[0]
    if src_lang != self.src_language and src_lang not in self.src_language:
        return [0]
    if trg_lang != self.trg_language and trg_lang not in self.trg_language:
        return [0]
    return [1]
def detect_language(filename):
    with codecs.open(filename=filename, mode="r", encoding="utf-8") as fd:
        while True:
            line = fd.readline().strip("\n")
            print line
            if not line:
                break
            title = line.split(':')[1]
            print langid.classify(title), ":", title
def decide(self, tu):
    src_lang = langid.classify(tu.src_phrase)[0]
    trg_lang = langid.classify(tu.trg_phrase)[0]
    if src_lang != self.src_language and src_lang not in self.src_language:
        return 'reject'
    if trg_lang != self.trg_language and trg_lang not in self.trg_language:
        return 'reject'
    return 'accept'
def fix_encoding(expected_langs, text, data):
    detected_lang, _confidence = langid.classify(text)
    if detected_lang not in expected_langs:
        enc = chardet.detect(text.encode("raw_unicode_escape"))
        sys.stderr.write(str(enc))
        if enc["encoding"] != "ascii":
            fixed_text = text.encode("raw_unicode_escape").decode(enc["encoding"])
            detected_lang, _confidence = langid.classify(fixed_text)
            if detected_lang in expected_langs:
                fixed_data = data.decode("utf-8").encode("raw_unicode_escape") \
                                 .decode(enc["encoding"]).encode("utf-8")
                return fixed_data
    return data
def init_languages():
    # This takes some time to load:
    global language_support
    # Imported modules are just variables - names bound to some values.
    # So all you need is to import them and make them global with the global keyword.
    global langid
    import langid
    # I have to do that for the library to initialize, which takes some time
    langid.classify("test")
    language_support = True
    print "Language identification support loaded"
def remove_punct_noneng(df):
    # remove punctuation, keeping @ and # (handled separately below)
    punctuation_remove = string.punctuation
    punctuation_remove = punctuation_remove.replace('@', '')
    punctuation_remove = punctuation_remove.replace('#', '')
    df['text'] = df['text'].str.replace('[{}]'.format(punctuation_remove), '')
    list_to_remove = ["\r", "\n", "–", "“", "”", "…", "‘", "’", "•"]
    df['text'] = [re.sub(r"#\w+", "", str(x)) for x in df['text']]
    df['text'] = [re.sub(r"@\w+", "", str(x)) for x in df['text']]
    df['text'] = [re.sub("—", " ", str(x)) for x in df['text']]  # replace em-dash with a space
    df["text"] = [re.sub('\s+', ' ', str(x)) for x in df["text"]]  # collapse runs of whitespace into a single space
    for elem in list_to_remove:
        df["text"] = df["text"].str.replace(elem, "")
    df["text"] = df["text"].str.lower()
    # remove all rows with foreign-language text
    for index, row in df.iterrows():
        text = row['text']
        # check for null text (NaN != NaN)
        empty = text is np.nan or text != text
        if not empty:
            if len(text) >= 3:
                lang, _ = langid.classify(text)
                if lang != "en":
                    df.drop(index, inplace=True)
    return df
def ask(search: str):
    interim = elasticsearch_client.search(index=DB_INDEX_AUTOCOMPLETE, body={
        '_source': ['text'],
        'query': {
            "bool": {
                "must": [
                    {"match": {"text": search}},
                    {"exists": {"field": "count"}}
                ]
            }
        },
        'size': 10,
        'sort': [{'count': {'order': 'desc'}}]
    })
    resultCount = len(interim['hits']['hits'])
    result = []
    for i in range(resultCount):
        result.append(interim['hits']['hits'][i]['_source']['text'])
    lang, score = langid.classify(search)
    return {"results": result, "language": lang}
def score(
    self,
    hypothesis: List[str],
    references: List[List[str]],
    tags: Optional[List[List[str]]] = None
) -> VizSeqScore:
    corpus_score, sent_scores, group_scores = None, None, None
    import bert_score as bs
    import langid
    import logging
    logging.getLogger('pytorch_pretrained_bert').setLevel(logging.WARNING)
    logging.getLogger('langid').setLevel(logging.WARNING)
    lang = langid.classify(references[0][0])[0]
    sent_scores = bs.score(
        hypothesis, references[0], nthreads=self.n_workers, lang=lang,
        verbose=self.verbose
    )[2].tolist()
    if self.corpus_level:
        corpus_score = np.mean(sent_scores)
    if tags is not None:
        tag_set = self._unique(tags)
        group_scores = {}
        for t in tag_set:
            indices = [i for i, cur in enumerate(tags) if t in cur]
            group_scores[t] = np.mean([sent_scores[i] for i in indices])
    return VizSeqScore.make(
        corpus_score=corpus_score, sent_scores=sent_scores,
        group_scores=group_scores
    )
def check_lang(data_str):
    predict_lang = langid.classify(data_str)
    if predict_lang[1] >= .9:
        language = predict_lang[0]
    else:
        language = 'NA'
    return language
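A note on the confidence value used above: by default the second element returned by langid.classify is an unnormalized log-probability, not a value in [0, 1], so a threshold such as 0.9 is only meaningful with normalized scores. A minimal sketch of the normalization that langid.py documents for this purpose:

    import langid
    from langid.langid import LanguageIdentifier, model

    # The module-level classify() returns raw log-probabilities:
    print(langid.classify("This is a test"))   # e.g. ('en', -54.41...)

    # An identifier built with norm_probs=True maps scores into [0, 1]:
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    lang, prob = identifier.classify("This is a test")
    print(lang, prob)                          # e.g. en 0.999...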
def user_langid(candidate):
    print 'Performing langid by user'
    conn = sqlite3.connect(config.DB)
    c = conn.cursor()
    users = set()
    user_text = {}
    from langid import classify
    c.execute('SELECT user,tweet FROM tweets')
    count = 0
    for user, status in c.fetchall():
        count += 1
        if count % 100000 == 0:
            print 'Processed:', count
        if user not in candidate:
            continue
        status = pickle.loads(str(status))
        if user not in user_text:
            user_text[user] = []
        user_text[user].append(space_re.sub(' ', remove_specific_re.sub(' ', status.text)).strip())
    count = 0
    lang_distr = {}
    for user in user_text:
        count += 1
        if count % 1000 == 0:
            print 'Users:', count, '/', len(user_text)
        lang = classify(' '.join(user_text[user]))[0]
        if lang in config.LANGS:
            users.add(user)
        lang_distr[lang] = lang_distr.get(lang, 0) + 1
    print len(users)
    print sorted(lang_distr.items(), key=lambda x: -x[1])
    conn.close()
    print 'Users:', len(users), '/', len(user_text)
    return users
def movie_review_emotions(doc, **kwargs):
    """Emotion (fine-grained sentiment) tagger for movie reviews.

    The training data for this function is that of Buitinck et al., with the
    training and test data concatenated. The algorithm is SVMs in a binary
    relevance (one-vs-rest) combination.

    You may use the training data (and this function) for academic/research
    purposes only. Add a parameter for_academic_research=True if you accept
    the license.

    Returns
    -------
    tagged : list of (string, list of string)
        A list of (sentence, labels) pairs. Each sentence may have zero or
        more labels.

    References
    ----------
    L. Buitinck, J. van Amerongen, E. Tan and M. de Rijke (2015).
    Multi-emotion detection in user-generated reviews. Proc. ECIR.
    https://staff.fnwi.uva.nl/m.derijke/wp-content/papercite-data/pdf/buitinck-multi-emotion-2015.pdf
    """
    if not (kwargs.get("for_academic_research", False)
            or kwargs.get("unittest", False)):
        raise RuntimeError(
            "This functionality is only available for"
            " academic research. Please use movie_review_emotions(doc,"
            " for_academic_research=True) to use this function for"
            " that purpose."
        )

    from ._emotion import classify
    nltk_download("punkt")
    sentences = pipe(doc, fetch, nltk.sent_tokenize)
    return list(zip(sentences, classify(sentences)))
def filter_language(self, domain):
    filter_lang_set = set(["en", "es"])
    if langid.classify(domain)[0] in filter_lang_set:
        return True
    return False
def is_english(text):
    """Return True if the (HTML) text is English."""
    lang = langid.classify(text)
    if lang and 'en' in lang[0]:
        return True
    return False
def format_tweets():
    f = open("../twitter_scraper/28_04_2014_scrape.csv", "r")
    w = open("parsed_tweets_scraped.csv", "w")
    while 1:
        file_content = f.readlines(10000)
        if not file_content:
            break
        file_content = process_file_content(file_content)
        non_dup = list(set(file_content))
        for line in non_dup:
            tup = langid.classify(line)
            if "en" in tup:
                if "diabetes" in line:
                    line = line.replace("'", "").strip()
                    line = line.replace('"', "")
                    if not line.startswith('"RT'):
                        if not "http" in line and not "https" in line:
                            w.write(line + "\n")
                        else:
                            line = re.sub(
                                r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""",
                                "",
                                line,
                            )
                            w.write(line + "\n")
    f.close()
    w.close()
def _get_date_info(key_minutes: str, date_line: str, verbose: bool = False) -> dict:
    funcs = get_parse_functions(langid.classify(date_line)[0])
    result = {}
    _ = extract_time(date_line)
    result[key_minutes] = _[0] if _ else None
    if verbose:
        print('')
    for fun in funcs:
        try:
            if verbose:
                print("Parse %s: %s" % (fun.__name__, fun(date_line)[0]))
            _ = fun(date_line)[0]
            _1 = datetime.now()
            if str(_) == _1.strftime("%Y-%m-%d %H:%M:%S"):
                continue
            if isinstance(_, datetime) and _.year > _1.year:
                continue
            result[fun.__name__] = _
        except AttributeError as e:
            # if verbose:
            #     print("Run '%s', error - %s" % (fun.__name__, e))
            pass
    return result
def process(web_resource):
    text = web_resource.get_text()
    if text:
        (lang, confidence) = langid.classify(text)
        if confidence >= 0.9:
            return web_resource.url, lang
    return web_resource.url, 'unknown'
def __init__(self, filename):
    """
    Parameters
    ----------
    filename: str
        Path to the (plaintext) file for this document.
    """
    # open file with utf-8-sig to remove any BOMs
    with open(filename, "r", encoding="utf-8-sig", errors="ignore") as infile:
        self.string = clean_whitespace(infile.read())
    self.lang = langid.classify(self.string)[0]
    if self.lang == 'de':
        self._stemmer = GermanStemmer()
    elif self.lang == 'en':
        self._stemmer = EnglishStemmer()
    else:
        print("no stemmer for '{}'".format(self.lang))
        print("falling back to 'de'...")
        self._stemmer = GermanStemmer()
    self.name = os.path.splitext(os.path.split(filename)[1])[0]
    self.ID = self.name
    self.tokens = word_tokenize(self.string)
    self.stems = list(map(self.stem, self.tokens))
    self.length = len(self.tokens)
    self.hashes = list(map(hash, self.stems))
    self.sents = self._get_sents()
    self.freq_dist = dict(Counter(self.hashes))
def _guess_language_of_text(text):
    """
    Guess the language of a string.

    Returns a dict of type multilanguage text, if possible with the correct
    language attribution to the text.
    """
    probables = ["en", "de", "fr"]
    text = str(text)
    predefined = _get_predefined_multilanguage_string(text)
    if predefined:
        return predefined
    try:
        lang_txtblob = TextBlob(text).detect_language()
        if lang_txtblob in probables:
            return {lang_txtblob: text}
    except Exception:
        lang_txtblob = None
    try:
        guess = guess_language.guess_language(text, probables)
        if guess in probables:
            return {guess: text}
        if not isinstance(guess, str):
            guess = None
    except Exception as e:
        guess = None
    try:
        la_id, _ = langid.classify(text)
        det = detect(text)
        if det and la_id and det == la_id and det in probables:
            return {det: text}
    except Exception as e:
        det = None
    return {"XX": text}
def compute_sentiment(self, message):
    if message.language == 'en':
        # helps prevent misidentified English language
        message.language = langid.classify(message.message)[0]
    native_language = iso639_1_to_native.get(message.language, 'english') \
        if hasattr(message, 'language') else 'english'
    iso_639_2 = iso639_1_to_2.get(message.language, 'eng') \
        if hasattr(message, 'language') else 'eng'
    words = self.nltk.tweet_tokenize(message.message.encode('utf8'), True, True, True)
    # print "native language is", native_language
    no_stopwords = self.nltk.remove_stopwords(words, native_language)
    word_sentiments = [
        senti for senti in map(
            lambda word: self.compute_word_sentiment(word, iso_639_2),
            no_stopwords)
        if senti is not None
    ]
    return {
        'pos': round(sum(map(lambda s: s['pos'], word_sentiments)) / len(word_sentiments), 3),
        'neg': round(sum(map(lambda s: s['neg'], word_sentiments)) / len(word_sentiments), 3),
        'obj': round(sum(map(lambda s: s['obj'], word_sentiments)) / len(word_sentiments), 3)
    } if any(word_sentiments) else None
def langsplit(filename, text):
    cmd = ["/home/buck/net/build/mtma_bitext//html_convert/langsplit",
           "--printchunks"]
    proc = Popen(cmd, stdin=PIPE, stdout=PIPE)
    tld = filename.split("/")[0].split(".")[0]
    header = "%s tld:%s uri:%s\n" % (magic_numer, tld, filename)
    proc.stdin.write(header)
    proc.stdin.write(text.encode("utf-8"))
    proc.stdin.write("\n")
    output = proc.communicate()[0]
    if not output.strip():
        # sys.stderr.write("writing debug file.\n")
        # f = open("debug", "w")
        # f.write(header)
        # f.write(text.encode("utf-8"))
        # f.close()
        res = langid.classify(text)
        lang = res[0]
        header = "%s\tlanguage:%s\tbytes:%d\n" % (header.rstrip(), lang,
                                                  len(text.encode("utf-8")))
        return header + text
    return output
def language_processing():
    path = '/Users/ze/Documents/PycharmProjects/Data/Instagram/'  # MacBookPro
    df = pd.read_csv(path + 'comment.csv', nrows=10)
    df_text = df['text'].str.replace('[’·°–!"#$%&\'()*+,'
                                     '-./:;<=>?@,。?★、…【】()《》?“”‘’![\\]^_`{|}~]+',
                                     " ", regex=True)
    # df_text = df_text.apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
    df_text = df_text.apply(lambda x: x.lower())
    df_language_1 = pd.DataFrame()
    df_language_2 = pd.DataFrame()
    df_language_openai = pd.DataFrame()
    for index, item_iter in df_text.iteritems():
        # df_text.iloc[index] = deEmojify(item_iter)
        try:
            lg = detect(item_iter)
        except:
            lg = 'unknown'
        df_language_1.loc[index, 0] = lg
        try:
            lg = langid.classify(item_iter)
        except:
            lg = ('unknown', 0.0)  # keep the tuple shape so lg[0] below stays valid
        df_language_2.loc[index, 0] = lg[0]
        lg = detect_language(item_iter)
        df_language_openai.loc[index, 0] = lg['answers'][0]
    df_text = pd.concat([df_text, df_language_1, df_language_2, df_language_openai],
                        axis='columns', sort=False)
    df_text.columns = ['text', 'langdetect', 'langid', 'openai']
def googlebooks_scr(parsed_url, date_format='%Y-%m-%d') -> tuple:
    """Create the response namedtuple."""
    parsed_query = parse_qs(parsed_url.query)
    id_ = parsed_query.get('id')
    if id_ is not None:
        volume_id = id_[0]
    else:  # the new URL format
        volume_id = parsed_url.path.rpartition('/')[2]
    dictionary = ris_parse(request(
        f'https://{parsed_url.netloc}/books/download/?id={volume_id}'
        f'&output=ris',
        spoof=True).content.decode('utf8'))
    dictionary['date_format'] = date_format
    # manually adding page number to dictionary:
    pg = parsed_query.get('pg')
    if pg is not None:
        pg0 = pg[0]
        dictionary['page'] = pg0[2:]
        dictionary['url'] += f'&pg={pg0}'
    # although google does not provide a language field:
    if not dictionary['language']:
        dictionary['language'] = classify(dictionary['title'])[0]
    return dict_to_sfn_cit_ref(dictionary)
def detect_lang(text1: str, checklen: int = 3000) -> str:
    '''
    Detect Chinese and other languages.

    Return a language string for use in seg_sent(text, lang=lang).
    '''
    if not text1:
        return 'english'
    text0 = text1[:checklen]
    detected = 'en'
    try:
        detected = langid.classify(text0)[0]
    except Exception as exc:
        LOGGER.warning(" langid.classify failed: %s, falling back to english", exc)
    # if detected not in ['zh', 'en', 'fr', 'it', 'de', 'pt', 'es']: detected = 'en'
    # lang_dict[detected]
    try:
        lang = pycountry.languages.get(alpha_2=detected).name.lower()
    except Exception as exc:
        LOGGER.warning('Unable to detect, %s, setting to english', exc)
        lang = 'english'
    return lang
def filter_language(docs, attribute, lang='en', filter_empty=True):
    """
    Filter a document collection on language.

    Parameters
    ----------
    docs : Sequence[epo_utils.documents.ExchangeDocument]
        Documents to filter.
    attribute : str
        Text attribute to filter on.
    lang : str
        Language code, default "en".
    filter_empty : bool
        If True, empty strings will be filtered out as well.

    Yields
    ------
    api.ops.documents.ExchangeDocument
    """
    for doc in docs:
        text = getattr(doc, attribute)
        if filter_empty and not text:
            continue
        elif text:
            # classify returns a (language, score) tuple, so compare
            # its first element rather than the tuple itself
            if langid.classify(text)[0] == lang:
                yield doc
        elif not text:
            yield doc
def tag_lang_pair(cls, src: str, ref: Optional[str]) -> List[str]:
    machine_tags = []
    src_lang = langid.classify(src)[0]
    ref_lang = None if ref is None else langid.classify(ref)[0]
    if ref is not None and src_lang == ref_lang:
        machine_tags.append(f'lang: {ref_lang}')
    else:
        machine_tags.append(f'src_lang: {src_lang}')
        if ref is not None:
            machine_tags.append(f'trg_lang: {ref_lang}')
    if ref_lang is not None \
            and ref_lang in cls.POTENTIAL_UNSEGMENTED_LANGUAGES \
            and ref.find(' ') == -1:
        machine_tags.append('unsegmented_trg')
    return machine_tags
def translate(self, inputFile, outputFile, lik, ssize):
    try:
        fin = inputFile.decode('utf-8')
    except Exception:
        pass
    lineTuple = langid.classify(inputFile)  # call langid to detect the language of this content
    if lineTuple[0] in lik[0] or lineTuple[0] in self.countrylist:  # if the content is mostly in the wanted language
        if lineTuple[0] not in lik[0]:
            countr = lik[1]
            outurl = outputFile.split('/')
            outurl[-3] = countr
            outurlstr = '/'.join(outurl[:-3])
            sitesize = PathSize().GetPathSize(outurlstr)  # in MB
            if float(sitesize) >= float(ssize):
                return True
            outurlFile = '/'.join(outurl)
        p = re.compile(r'[\n]+')
        with codecs.open(outputFile, 'w', "utf-8") as fout:  # open the output file in write mode
            try:
                fout.writelines(p.sub('\n', inputFile))
            except Exception:
                fout.writelines(p.sub('\n', fin))
        return True
    else:
        logger.error('The language of the file content, %s, does not match '
                     'the wanted article language, %s!'
                     % (lineTuple[0], ','.join(lik[0])))
        return False
def langCheck(us):
    # filter out names in the format of Axx Bxx.
    p = re.compile("[A-Z][a-z]* [A-Z][a-z]*")
    clean_us = p.sub("", us)
    engFlag = 0
    chiFlag = 0
    for ch in clean_us:
        if u'\u3040' <= ch <= u'\u309f' or \
           u'\u30a0' <= ch <= u'\u30ff':
            return 3
        if u'A' <= ch <= u'Z' or u'a' <= ch <= u'z':
            engFlag = 1
        if u'\u4e00' <= ch <= u'\u9fff' or \
           u'\uac00' <= ch <= u'\ud7af':
            chiFlag = 1
        if engFlag == 1 and chiFlag == 1:
            break
    # return 0 for English, 1 for non-English,
    # 2 for both, 3 for other (Japanese & Spanish)
    if engFlag == 1 and chiFlag == 1:
        return 2
    elif chiFlag == 1:
        return 1
    else:
        if langid.classify(us)[0] == 'es':
            print us
            return 3
        else:
            return 0
def check_german(tweet_text):
    emoji_key = pandas.read_csv('DATA/emoji_table.txt', encoding='utf-8', index_col=0)
    emoji_key['count'] = 0
    emoji_dict = emoji_key['count'].to_dict()
    emoji_dict_total = emoji_key['count'].to_dict()
    emoji_list = emoji_dict.keys()
    tweet_text = unicode(tweet_text, 'utf-8')
    tweet_text = tweet_text.encode('utf-8')
    tokens = tweet_text.split(' ')
    new_text = ''
    # delete @username
    for token in tokens:
        if '@' not in token:
            new_text += token + ' '
    new_text = new_text.lower()
    text = unicode(new_text, 'utf-8')
    text = text.encode('utf-8')
    top_language_name = cld.detect(text)
    lang_form_langid = langid.classify(text)
    if new_text == '':
        return True  # if the text is empty - treat it as German
    if top_language_name[0] == 'GERMAN' or lang_form_langid[0] == 'de':
        return True
    else:
        return False
def detectLanguage(df: pd.DataFrame):
    """
    Given a DataFrame of grouped conversations, detect the language of the
    texts using langdetect and langid, then apply a rule that compares their
    results to make the decision.

    :param df: the DataFrame where a single row contains one conversation
    :return: two DataFrames, one with all conversations in English and one
        with those in other languages
    """
    print("\nDetecting languages...")
    # langdetect (https://github.com/Mimino666/langdetect) works better on long texts; it supports:
    # af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
    # hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
    # pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw
    df['lang1'] = df.apply(lambda row: langdetect.detect(row['body']), axis=1)
    # langid (https://github.com/saffsd/langid.py) works better on short texts,
    # and performs far better when a consideration set is given; it supports:
    # af, am, an, ar, as, az, be, bg, bn, br, bs, ca, cs, cy, da, de, dz, el, en, eo, es, et, eu, fa, fi, fo,
    # fr, ga, gl, gu, he, hi, hr, ht, hu, hy, id, is, it, ja, jv, ka, kk, km, kn, ko, ku, ky, la, lb, lo, lt,
    # lv, mg, mk, ml, mn, mr, ms, mt, nb, ne, nl, nn, no, oc, or, pa, pl, ps, pt, qu, ro, ru, rw, se, si, sk,
    # sl, sq, sr, sv, sw, ta, te, th, tl, tr, ug, uk, ur, vi, vo, wa, xh, zh, zu
    df['lang2'] = df.apply(lambda row: langid.classify(row['body'])[0], axis=1)
    nonEN = []
    # 'lang1' and 'lang2' are now two new columns on the right-hand side
    for reset_index, row in df.iterrows():
        # the criteria: since 'lang2' tends to be more reliable, the rule
        # checks the result of 'lang2' first and then sees whether it is
        # consistent with that given by 'lang1'
        if row['lang2'] != 'en' and row['lang1'] == row['lang2']:
            nonEN.append(row['conversationID'])
    print("There are " + str(len(nonEN)) + " chats in other languages.")
    # separate the DataFrame into two: the English and the non-English
    chatsEN = df[df.apply(lambda row: row['conversationID'] not in nonEN, axis=1)]
    chatsNonEN = df[df.apply(lambda row: row['conversationID'] in nonEN, axis=1)]
    return chatsEN, chatsNonEN
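One caveat with the lang1/lang2 comparison above: langdetect is non-deterministic by default, so repeated runs can disagree on ambiguous texts. A short sketch, using langdetect's documented seed mechanism, that makes the rule reproducible:

    import langdetect
    from langdetect import DetectorFactory

    # Pin the detector's random seed so detect() always returns the
    # same language for the same input text.
    DetectorFactory.seed = 0
    print(langdetect.detect("War doesn't show who's right, just who's left."))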
def process(self, instance, verbose=True, commit=True, **kwargs):
    """ See source code. """
    instance_name = instance._meta.verbose_name
    instance_id = instance.id
    try:
        language, probability = langid.classify(instance.content)
    except:
        LOGGER.exception(u'Exception while detecting language of %s %s',
                         instance_name, instance_id)
        return
    if language:
        if probability > 0.8:
            instance.language = models.Language.get_by_code(language)
            if verbose:
                LOGGER.info(u'language-detector: set %s %s language to %s '
                            u'(confidence: %s).', instance_name, instance_id,
                            instance.language, probability)
            if commit:
                instance.save()
        else:
            LOGGER.warning(u'language-detector: Confidence too low (%s) '
                           u'to set a language on %s %s based on its '
                           u'content.', probability, instance_name,
                           instance_id)
    else:
        LOGGER.warning(u'language-detector: No language detected in %s %s '
                       u'content.', instance_name, instance_id)
def check_word(word, article_url, word_context):
    time.sleep(1)
    print(word)
    client.captureMessage("API Checking Word", extra={
        'word': word,
    })
    if not check_api(word):
        client.captureMessage("API Rejection", extra={
            'word': word,
            'word_context': word_context,
        })
        return
    language, confidence = langid.classify(word_context)
    if language != 'en':
        client.captureMessage("Language Rejection", extra={
            'word': word,
            'word_context': word_context,
            'confidence': confidence
        })
        return
    if int(r.get("recently") or 0) < 8:
        r.incr("recently")
        r.expire("recently", 60 * 30)
        tweet_word(word, article_url, word_context)
    else:
        client.captureMessage("Recency Rejection", extra={'word': word})
def __extractEnglishSentences(self, rawtext):
    """Function to extract the English sentences in a string.

    :param rawtext: string that we want to clean
    """
    sentences = sent_tokenize(rawtext, 'english')
    return ' '.join(
        [el for el in sentences if langid.classify(el)[0] == 'en'])
def on_data(self, data):
    try:
        tweet = data.split(',"text":"')[1].split('","source')[0]
        tweet = re.sub(r'\shttps?:(.+)', '', tweet)
        lang = langid.classify(tweet)[0]
        if lang == 'en':
            created = int(time.time())
            created_adj = round_ten_min(created)
            clean_tweet = re.sub("^rt\s@.+?:\s", "", tweet.lower())
            clean_tweet = re.sub("[^\w\s]", "", clean_tweet)
            sent = nb.predict(cv.transform([clean_tweet]))[0]
            prob = nb.predict_proba(
                cv.transform([rem_stop_words(clean_tweet)]))[0][0]
            cur.execute("INSERT INTO mctweets (time, time_adj, text, text_cleaned, "
                        "sents, prob) VALUES (%s, %s, %s, %s, %s, %s)",
                        (created, created_adj, tweet, clean_tweet, sent, prob))
            cur.execute("DELETE FROM mctweets WHERE time < %s;", [created - 90001])
            conn.commit()
        return True
    except BaseException, e:
        print 'failed ondata', str(e)
        print data
        conn.rollback()
        time.sleep(5)
def split(self, input_s):
    self.s = input_s
    self.token = jieba.tokenize(self.s)
    num_en = 0
    num_zh = 0
    for t in self.token:
        if not t[0].isspace():
            if t[0] in ',,"\'‘’“”#@%<>《》{}【】[]。,!!??':
                self.symbol.append(t)
            else:
                lang = langid.classify(t[0])[0]
                if lang == "en":
                    self.english.append(t)
                    num_en += 1
                elif lang == "zh":
                    self.chinese.append(t)
                    num_zh += 1
                else:
                    self.other.append(t)
    if num_en == 1 and num_zh == 1:
        code_mix = 1
    if num_en == 0 and num_zh == 0:
        self.note = "other"
    elif num_en > num_zh:
        self.note = "en"
        self.translate_en_zh()
    else:
        self.note = "zh"
        self.translate_zh_en()
def extract(self, text: str, method: str) -> List[Extraction]:
    """
    Args:
        text (str): any text, can contain HTML
        method (Enum[IdentificationTool.LANGID, IdentificationTool.LANGDETECT]):
            specifies which of the two algorithms to use

    Returns:
        List(Extraction): an extraction containing the language code used in
        the text. Returns the empty list if the extractor fails to identify
        the language in the text.
    """
    if method == IdentificationTool.LANGID.name:
        language = classify(text)[0]
        return [Extraction(value=language, extractor_name=self.name)]
    elif method == IdentificationTool.LANGDETECT.name:
        try:
            language = detect(text)
        except:
            language = 'unknown'
        if language == 'unknown':
            return list()
        else:
            return [Extraction(value=language, extractor_name=self.name)]
    else:
        return list()
def process(self, tup):
    text = tup.values[1]
    language = langid.classify(text)[0]
    # l = LangID()
    # l.train()
    # language = l.classify(text)
    storm.emit([tup.values[0], language])
def comment_process(comment):
    # result to return
    res = defaultdict(int)
    nltk_classifier = nltk_sentiment()
    # filtering rules
    filter_res = filter(comment)
    if filter_res != 'true':
        fail[filter_res] += 1
        return {}
    # count the emojis
    res['emoji'] = cal_emo(comment)
    # delete the emojis from the sentence
    comment = delete_emo(comment)
    # predict the sentiment
    if len(comment) > 2:
        # predict the language
        lan = langid.classify(comment)
        langage[lan[0]] += 1
        ss = nltk_classifier.polarity_scores(comment)
        ss.pop('compound')
        # filter out insignificant scores here, i.e. the top score must exceed 0.5
        max_key = max(ss, key=ss.get)
        if ss[max_key] > 0.5:
            res[max_key + '_num'] = 1
            res[max_key + '_value'] = ss[max_key]
    return res
def detect_lang(text):
    """Returns language of input string text.

    :param text: string
    :return: language code string, e.g. cs
    """
    return unicode(langid.classify(text)[0])
def split_by_language(reviews):
    """
    Split the reviews based on their language.

    input arguments:
        reviews: a list of review items
    output arguments:
        reviews_dict_languages: a dictionary with languages as keys, and a list
        of the corresponding reviews as value.
    """
    # Initialization
    reviews_dict_languages = {}
    langid.set_languages(language_list)

    # Use a counter to visualize the progress
    count = 1

    # Loop over all reviews
    for review in reviews:
        # Detect the language
        language = langid.classify(review.content)[0]

        # Store the review in the corresponding dictionary by language
        if language in reviews_dict_languages:
            reviews_dict_languages[language].append(review)
        else:
            reviews_dict_languages[language] = []
            reviews_dict_languages[language].append(review)

    return reviews_dict_languages
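language_list above is defined elsewhere in that module. langid.set_languages constrains classification to a known candidate set, which usually helps on short review texts; a minimal sketch with a hypothetical three-language list:

    import langid

    # Restrict scoring to a candidate set (illustrative list, not the
    # original language_list):
    langid.set_languages(['en', 'de', 'fr'])
    print(langid.classify("Das ist eine sehr gute Rezension."))  # ('de', ...)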
def process_sentence(sentence):
    '''
    Only process Chinese sentences.
    '''
    if langid.classify(sentence)[0] == 'zh':
        return segment_chinese_sentence(sentence)
    return sentence
def __init__(self, db='postgres:///novichenkobot', connection=None,
             keywords='inputs/keywords.txt', *args, **kwargs):
    super(GeneralSpider, self).__init__(*args, **kwargs)
    self.le = LinkExtractor()

    # load keywords dictionaries
    self.keywords = {}
    with open(keywords) as f:
        for line in f:
            lang, wordstr = line.split(':')
            words = [word.strip().lower() for word in wordstr.split(',')]
            self.keywords[lang] = words

    # langid lazily loads models for language identification,
    # and calling it here forces it to load the models now
    lang = langid.classify('test')[0]

    # database connection
    if connection is None:
        engine = sqlalchemy.create_engine(
            db,
            connect_args={
                'connect_timeout': 120,
                'application_name': 'NovichenkoBot_GeneralSpider',
            })
        self.connection = engine.connect()
    else:
        self.connection = connection
def extract_dict_ft_format_zh(file, out_file):
    '''multi-label zh'''
    with open(out_file, 'w') as out_f:
        with open(file, 'r') as in_f:
            for line in tqdm(in_f.readlines()):
                sent = line.split('\t', 1)
                label_str, sent_comment = sent
                sent_comment = sent_comment.replace("\n", '')
                # sent_comment = clean_str(sent_comment)  # use for en
                sent_comment = sent_comment.replace(" ", "")  # zh
                seg_line = jieba.cut(sent_comment)  # zh
                sent_comment = clean_seg_coment(seg_line)  # zh
                label = []
                if label_str == 'NULL':
                    final_line = '__label__' + 'NULL' + ' ' + sent_comment + '\n'
                    out_f.write(final_line)
                else:
                    label_dict = eval(label_str)
                    flag = 0  # zh
                    for k, v in label_dict.items():
                        hit_lang = langid.classify(v["hit"])[0]  # zh
                        if hit_lang != "zh":  # zh
                            flag = 1  # zh
                        label.append('__label__' + v['label'])
                    tmp_label = ' '.join([x for x in label])
                    final_line = tmp_label + ' ' + sent_comment + '\n'
                    if flag == 0:  # zh
                        out_f.write(final_line)
def parse(text):
    lang, *_ = langid.classify(text)
    lines = text.strip().split('\n')
    output = []
    day, month, year, time, place, date = [None] * 6
    for line in lines:
        line = line.strip()
        if line and DataParser.filter(line):
            output.append(line)
        pattern = re.compile('([0-9]+) ([A-Z]+) ([0-9]+) ([0-9]+:[0-9]+[AP]M) by PIB (.+)')
        matches = pattern.match(line)
        if matches:
            day, month, year, time, place = matches.groups()
            date_string = f'{month} {day} {year} {time}'
            date = datetime.strptime(date_string, '%b %d %Y %I:%M%p')
    content = '\n'.join(output)
    return {
        "lang": lang,
        "content": content,
        "date": date,
        "city": place
    }
def clean_train_data(train_data, min_ents=0, min_text_len=5, lang=['de']):
    """
    Removes items with no entities or fewer entities than min_ents.

    :param train_data: A list of lists of spacy-like NER tuples\
        [(('some text'), entities{[(15, 19, 'place')]}), (...)]
    :param min_ents: An integer defining the minimum number of entities.
    :param min_text_len: An integer defining the minimum length of the text.
    :param lang: A list of language codes. If populated, only samples matching those\
        languages will be included in the returned results.
    :return: A list of lists of spacy-like NER tuples\
        [(('some text'), entities{[(15, 19, 'place')]}), (...)]
    """
    TRAIN_DATA = []
    for x in train_data:
        try:
            ents = x[1]
        except TypeError:
            ents = None
        if ents and len(ents['entities']) >= min_ents and len(x[0]) >= min_text_len:
            TRAIN_DATA.append(x)
    if len(lang) > 0:
        TRAIN_DATA_LANG = []
        for x in TRAIN_DATA:
            lng, prob = langid.classify(x[0])
            if lng in lang:
                TRAIN_DATA_LANG.append(x)
        return TRAIN_DATA_LANG
    return TRAIN_DATA
def transcribe(text, lang=None, alphabet="IPA", syllabic_separator=u".",
               stress_mark=u"'", word_separator=u"|", auto_lang=False):
    """
    Get the phonetic transcription of `text`.

    :param text: unicode string to transcribe
    :param lang: string with the ISO 639-1 code or IETF language tag of `text`
    :param alphabet: string with the name of the phonetic alphabet to use
    :param syllabic_separator: string with the syllabic separator character
    :param stress_mark: string to mark the stress in words
    :param word_separator: string with the word separator character
    :param auto_lang: boolean to perform an automatic language identification
    :return: string with the phonetic transcription of `text`
    """
    if auto_lang or not lang:
        if not langid:
            raise ImportError("Please, install langid")
        lang = langid.classify(text)[0]
    transcriptor = get_transcriptor(
        lang=lang,
        alphabet=alphabet,
        syllabic_separator=syllabic_separator,
        word_separator=word_separator,
        stress_mark=stress_mark)
    return transcriptor.transcribe(
        text=text,
        syllabic_separator=syllabic_separator,
        word_separator=word_separator,
        alphabet=alphabet,
        stress_mark=stress_mark,
    )
def clean(fileName):
    brandList = []
    listFile = open(fileName, 'r')
    for line in listFile:
        if not line.startswith('#'):
            brandList.append(line.strip())
    listFile.close()

    for brand in brandList:
        print(brand)
        userTweets = {}
        tweetFile = open('data/userTweets2/' + brand + '.json', 'r')
        for line in tweetFile:
            try:
                data = json.loads(line.strip())
            except:
                continue
            userID = data['user']
            if userID not in userTweets:
                userTweets[userID] = []
            tweets = data['statuses']
            for tweet in tweets:
                if len(tweet['text']) > 5:
                    if langid.classify(tweet['text'])[0] == 'en':
                        userTweets[userID].append(tweet)
                if len(userTweets[userID]) > 20:
                    break
        tweetFile.close()

        outputFile = open('data/userTweets2/clean2/' + brand + '.json', 'w')
        for userID, tweets in userTweets.items():
            if len(tweets) > 19:
                output = {'user_id': userID, 'statuses': tweets}
                outputFile.write(json.dumps(output) + '\n')
        outputFile.close()
def write(self, text):
    '''Writes input to a dictionary.'''
    en_list = []  # create a list to handle English input
    ru_list = []  # create a list for processing Russian input
    in_list = text.split(' ')  # list of words in the input string
    if not text:
        self.lbl.setText('NONE')
        return None
    for word in in_list:  # iterate over the entered words
        ch, coal = langid.classify(word)  # ch stores the language of the word
        if ch == 'en' and word != '' and word != ' ':  # if the word is in English
            en_list.append(word)  # add it to the list of English words
        elif ch == 'ru':  # if the word is in Russian
            ru_list.append(word)  # add it to the list of Russian words
    # if the English input is a phrase or a hyphenated compound rather than a single word
    if len(en_list) > 1 or '-' in text:
        str_en = ' '.join(str(e) for e in en_list)
        str_ru = ' '.join(str(e) for e in ru_list)
        # the date is added to the dictionary values for possible further sorting
        self.di_ten[str_en] = str_ru, '{:%Y-%m-%d %H:%M:%S}'.format(datetime.now())
def prediction(text):
    if os.path.exists(model_location):
        model = joblib.load(model_location)
    else:
        model = model_training()
    # lang_detected = detect(text)
    lang_detected = classify(text)[0]
    print(text)
    prediction = model.predict(
        laser.embed_sentences([text], lang=lang_detected))
    probability = model.predict_proba(
        laser.embed_sentences([text], lang=lang_detected))
    probability[0].sort()
    max_probability = max(probability[0])
    # if (max_probability - 0.35) > probability[0][-2]:
    if max_probability > 0.63:
        pred_output = prediction[0]
    else:
        pred_output = 'None'
    print('{}-------------->{}'.format(max(probability[0]), pred_output))
    return ({
        'probability': max(probability[0]),
        'output': pred_output,
        'actual_output': prediction[0]
    })
def is_language(self, s, expected_lang):
    """
    Check if the language of the segment cannot be reliably identified
    as another language. If a language other than the expected one is
    detected, return False.
    """
    expected_lang = expected_lang.lower()
    if self.valid_languages:
        assert expected_lang in self.valid_languages
    if self.use_cld2:
        reliable, _text_bytes, details = cld2.detect(
            s.encode("utf-8"), isPlainText=True,
            useFullLangTables=True, bestEffort=True)
        if reliable:
            for _lang, langcode, confidence, score in details:
                if langcode == expected_lang and confidence >= 10:
                    return True
            return False
        else:
            # unreliable is still counted as OK
            return True
    else:
        lang, confidence = langid.classify(s.lower())
        if lang != expected_lang and confidence > 0.9:
            # confidence for wrong language higher than 90%
            return False
        else:
            return True
def check_german(tweet_text):
    if isinstance(tweet_text, unicode) is False:
        tweet_text = unicode(tweet_text, 'utf-8')
    tweet_text = tweet_text.encode('utf-8')
    tokens = tweet_text.split(' ')
    new_text = ''
    # delete @username
    for token in tokens:
        if '@' not in token:
            new_text += token + ' '
    new_text = new_text.lower()
    if isinstance(new_text, unicode) is False:
        text = unicode(new_text, 'utf-8')
        text = text.encode('utf-8')
    else:
        text = new_text.encode('utf-8')
    top_language_name = cld.detect(text)
    lang_form_langid = langid.classify(text)
    if new_text == '':
        return True  # if the text is empty - treat it as German
    if top_language_name[0] == 'GERMAN' or lang_form_langid[0] == 'de':
        return True
    else:
        return False
def on_data(self, data):
    if time.time() >= self.started + self.duration:
        stats = open('{0}-sample.stats'.format(int(self.started)), 'w+')
        stats.write("================= STATISTICS =================" + "\n")
        stats.write("Start time: " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.started)) + "\n")
        stats.write("End time: " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + "\n")
        stats.write("First Tweet ID: " + self.first_tweet_id + "\n")
        stats.write("Last Tweet ID: " + self.last_tweet_id + "\n")
        stats.write("Language: " + self.lang + "\n")
        stats.write("Language classification threshold: " + str(self.lang_threshold) + "\n")
        stats.write("Above threshold: " + str(self.counter[self.lang + '-above']) + "\n")
        stats.write("Below threshold: " + str(self.counter[self.lang + '-below']) + "\n")
        stats.write("Excluded: " + str(self.counter['excluded']) + "\n")
        return False
    elif 'in_reply_to_status_id' in data:
        status = Status.parse(self.api, json.loads(data))
        langclass = langid.classify(status.text)
        if self.counter == {self.lang + '-above': 0,
                            self.lang + '-below': 0,
                            'excluded': 0}:
            self.first_tweet_id = str(status.id)
        self.last_tweet_id = str(status.id)
        if langclass[0] == self.lang:
            if langclass[1] >= self.lang_threshold:
                self.above_output.write(data)
                self.counter[self.lang + '-above'] += 1
            else:
                self.below_output.write(data)
                self.counter[self.lang + '-below'] += 1
        else:
            self.excl_output.write(data)
            self.counter['excluded'] += 1
        return True
def gettinglan(self, text):
    lanre = langid.classify(text)
    language = lanre[0]
    certainty = lanre[1]
    return language, certainty
def extract_tweet(filename, min_num_chars=10):
    with open(filename, 'r') as tweetfile:
        for tweet in tweetfile:
            tweettext = parser.get_tweet_text(tweet, remove_urls=True,
                                              remove_retweets=True,
                                              remove_usernames=True)
            if len(tweettext) > min_num_chars:
                tweetlang = langid.classify(tweettext)
                print tweettext[0:50] if len(tweettext) > 50 else tweettext, " : ", tweetlang, '\n'
def apply(self, text, evaluation):
    'LanguageIdentify[text_String]'

    # see https://github.com/saffsd/langid.py
    # an alternative: https://github.com/Mimino666/langdetect
    import langid
    import pycountry

    code, _ = langid.classify(text.get_string_value())
    language = pycountry.languages.get(iso639_1_code=code)
    return String(language.name)
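The iso639_1_code keyword above belongs to older pycountry releases; current versions key two-letter ISO 639-1 codes as alpha_2, as the detect_lang example earlier on this page does. A small sketch of the same lookup against the newer API (assuming a recent pycountry):

    import langid
    import pycountry

    code, _score = langid.classify("Guten Tag, wie geht es Ihnen?")
    language = pycountry.languages.get(alpha_2=code)  # newer pycountry lookup
    print(code, "->", language.name)                  # de -> German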
def filter_language(df):
    df['lang'] = ''
    df['prob'] = ''
    for index, row in df.iterrows():
        df['lang'][index], df['prob'][index] = langid.classify(row['text'])
    return df[df.lang == 'en']
def parse_article(self, url, html):
    rdoc = readability.Document(html)
    summary = rdoc.summary()
    lang_id, _ = langid.classify(summary)
    article = newspaper.Article(url, config=self.config, language=lang_id)
    article.set_html(html)
    article.parse()
    return article