import sys

import langid


def main():
    s = sys.stdin.readlines()
    # Classify at most the first 10 lines read from stdin.
    if len(s) < 10:
        test_content = " ".join(s)
    else:
        test_content = " ".join(s[:10])
    print(langid.classify(test_content)[0])
Example #2
	def process_tu(self, tu, num_of_finished_scans):
		src_lang = langid.classify(tu.src_phrase)[0]
		trg_lang = langid.classify(tu.trg_phrase)[0]

		if src_lang != self.src_language and src_lang not in self.src_language:
			return [0]
		if trg_lang != self.trg_language and trg_lang not in self.trg_language:
			return [0]
		return [1]
Example #3
import codecs

import langid


def detect_language(filename):
    with codecs.open(filename=filename, mode="r", encoding="utf-8") as fd:
        while True:
            line = fd.readline().strip("\n")
            print(line)
            if not line:
                break
            title = line.split(':')[1]
            print(langid.classify(title), ":", title)
Example #4
	def decide(self, tu):
		src_lang = langid.classify(tu.src_phrase)[0]
		trg_lang = langid.classify(tu.trg_phrase)[0]

		if src_lang != self.src_language and src_lang not in self.src_language:
			return 'reject'
		if trg_lang != self.trg_language and trg_lang not in self.trg_language:
			return 'reject'

		return 'accept'
Example #5
def fix_encoding(expected_langs, text, data):
    detected_lang, _confidence = langid.classify(text)
    if detected_lang not in expected_langs:
        enc = chardet.detect(text.encode("raw_unicode_escape"))
        sys.stderr.write(str(enc))
        if enc["encoding"] != "ascii":
            fixed_text = text.encode("raw_unicode_escape").decode(enc["encoding"])
            detected_lang, _confidence = langid.classify(fixed_text)
            if detected_lang in expected_langs:
                fixed_data = data.decode("utf-8").encode("raw_unicode_escape").decode(enc["encoding"]).encode("utf-8")
                return fixed_data
    return data
Example #6
def init_languages():
    # This takes some time to load:
    global language_support

    # Imported modules are just variables - names bound to values - so all you
    # need to do is import the module here and declare the name global.
    global langid
    import langid
    
    # I have to do that for the library to initialize, which takes some time
    langid.classify("test")

    language_support = True
    print "Language identification support loaded"
Example #7
def remove_punct_noneng(df):
    # remove punctuation, keeping @ and # (they are stripped separately below)
    punctuation_remove = string.punctuation
    punctuation_remove = punctuation_remove.replace('@', '')
    punctuation_remove = punctuation_remove.replace('#', '')
    df['text'] = df['text'].str.replace('[{}]'.format(punctuation_remove), '', regex=True)
    list_to_remove = ["\r", "\n", "–", "“", "”", "…", "‘", "’", "•"]

    df['text'] = [re.sub(r"#\w+", "", str(x)) for x in df['text']]
    df['text'] = [re.sub(r"@\w+", "", str(x)) for x in df['text']]
    df['text'] = [re.sub("—", " ", str(x))
                  for x in df['text']]  #replace - with space
    df["text"] = [re.sub('\s+', ' ', str(x)) for x in df["text"]
                  ]  #remove more than 2 consec spaces with just one space

    for elem in list_to_remove:
        df["text"] = df["text"].str.replace(elem, "")

    df["text"] = df["text"].str.lower()

    # remove all rows with foreign language characters
    for index, row in df.iterrows():
        text = row['text']
        # check for null text
        empty = text is np.nan or text != text
        if not empty:
            if len(text) >= 3:
                lang, _ = langid.classify(text)
                if lang != "en":
                    df.drop(index, inplace=True)
    return (df)
Example #8
def ask(search: str):
    interim = elasticsearch_client.search(index=DB_INDEX_AUTOCOMPLETE, body=
    {
        '_source':['text'],
        'query':{
            "bool": {
                "must": [{
                    "match": {
                        "text": search
                    }
                },
                    {
                        "exists": {
                            "field": "count"
                        }
                    }]
            }
        },
        'size': 10,
        'sort' :[
                {'count' : {'order' : 'desc' }}
        ]
    })

    resultCount = len(interim['hits']['hits'])
    result = []
    for i in range(resultCount):
        result.append(interim['hits']['hits'][i]['_source']['text'])

    lang, score = langid.classify(search)

    return {
            "results":result,
            "language": lang
        }
Example #9
    def score(
            self, hypothesis: List[str], references: List[List[str]],
            tags: Optional[List[List[str]]] = None
    ) -> VizSeqScore:
        corpus_score, sent_scores, group_scores = None, None, None

        import bert_score as bs
        import langid
        import logging
        logging.getLogger('pytorch_pretrained_bert').setLevel(logging.WARNING)
        logging.getLogger('langid').setLevel(logging.WARNING)

        lang = langid.classify(references[0][0])[0]

        sent_scores = bs.score(
            hypothesis, references[0], nthreads=self.n_workers, lang=lang,
            verbose=self.verbose
        )[2].tolist()

        if self.corpus_level:
            corpus_score = np.mean(sent_scores)

        if tags is not None:
            tag_set = self._unique(tags)
            group_scores = {}
            for t in tag_set:
                indices = [i for i, cur in enumerate(tags) if t in cur]
                group_scores[t] = np.mean([sent_scores[i] for i in indices])

        return VizSeqScore.make(
                corpus_score=corpus_score, sent_scores=sent_scores,
                group_scores=group_scores
            )
def check_lang(data_str):
    predict_lang = langid.classify(data_str)
    if predict_lang[1] >= .9:
        language = predict_lang[0]
    else:
        language = 'NA'
    return language
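Note: depending on the langid.py version, the second value returned by the module-level langid.classify may be an unnormalized log-probability rather than a value in [0, 1], in which case a fixed 0.9 threshold will not behave as intended. A minimal sketch of a variant that thresholds on a normalized probability, assuming the bundled model:

from langid.langid import LanguageIdentifier, model

# Identifier whose confidence scores are normalized into [0, 1].
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

def check_lang_normalized(data_str, threshold=0.9):
    lang, prob = identifier.classify(data_str)
    return lang if prob >= threshold else 'NA'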
Example #11
def user_langid(candidate):
  print('Performing langid by user')
  conn=sqlite3.connect(config.DB)
  c=conn.cursor()
  users=set()
  user_text={}
  from langid import classify
  c.execute('SELECT user,tweet FROM tweets')
  count=0
  for user,status in c.fetchall():
    count+=1
    if count%100000==0:
      print('Processed:', count)
    if user not in candidate:
      continue
    status=pickle.loads(str(status))
    if user not in user_text:
      user_text[user]=[]
    user_text[user].append(space_re.sub(' ',remove_specific_re.sub(' ',status.text)).strip())
  count=0
  lang_distr={}
  for user in user_text:
    count+=1
    if count%1000==0:
      print('Users:', count, '/', len(user_text))
    lang=classify(' '.join(user_text[user]))[0]
    if lang in config.LANGS:
      users.add(user)
    lang_distr[lang]=lang_distr.get(lang,0)+1
  print(len(users))
  print(sorted(lang_distr.items(), key=lambda x: -x[1]))
  conn.close()
  print('Users:', len(users), '/', len(user_text))
  return users
Example #12
File: single.py Project: NLeSC/xtas
def movie_review_emotions(doc, **kwargs):
    """Emotion (fine-grained sentiment) tagger for movie reviews.

    The training data for this function is that of Buitinck et al., with the
    training and test data concatenated. The algorithm is SVMs in a binary
    relevance (one-vs-rest) combination. You may use the training data
    (and this function) for academic/research purposes only. Add a parameter
    for_academic_research=True if you accept the license.

    Returns
    -------
    tagged : list of (string, list of string)
        A list of (sentence, labels) pairs. Each sentence may have zero or
        more labels.

    References
    ----------
    L. Buitinck, J. van Amerongen, E. Tan and M. de Rijke (2015).
    Multi-emotion detection in user-generated reviews. Proc. ECIR.
    https://staff.fnwi.uva.nl/m.derijke/wp-content/papercite-data/pdf/buitinck-multi-emotion-2015.pdf
    """
    if not (kwargs.get("for_academic_research", False) or kwargs.get("unittest", False)):
        raise RuntimeError(
            "This functionality is only available for"
            " academic research. Please use movie_review_emotions(doc,"
            " for_academic_research=True) to use this function for"
            " that purpose."
        )

    from ._emotion import classify

    nltk_download("punkt")
    sentences = pipe(doc, fetch, nltk.sent_tokenize)
    return list(zip(sentences, classify(sentences)))
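The docstring above describes the classifier only in prose: independent SVMs combined in a binary-relevance (one-vs-rest) setup, one per emotion label. A minimal scikit-learn sketch of that idea, assuming hypothetical train_sents (a list of sentences) and train_labels (a list of label lists) stand in for the training data; this is an illustration, not the xtas implementation:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

# Binarize the label lists into an indicator matrix (one column per emotion).
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(train_labels)   # e.g. [["joy"], ["anger", "fear"], []]

# One linear SVM per label, applied independently (binary relevance).
clf = make_pipeline(TfidfVectorizer(), OneVsRestClassifier(LinearSVC()))
clf.fit(train_sents, Y)

# Each sentence may receive zero or more labels.
pred = mlb.inverse_transform(clf.predict(["What a moving, uplifting film."]))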
Example #13
	def filter_language(self, domain):

		filter_lang_set= set(["en", "es"])

		if langid.classify(domain)[0] in filter_lang_set:
			return True
		return False
def is_english(text):
    """ if the html text is english """

    lang = langid.classify(text)
    if lang and 'en' in lang[0]:
        return True
    return False
Example #15
def format_tweets():
    f = open("../twitter_scraper/28_04_2014_scrape.csv", "r")
    w = open("parsed_tweets_scraped.csv", "w")
    while True:
        file_content = f.readlines(10000)
        if not file_content:
            break
        file_content = process_file_content(file_content)
        non_dup = list(set(file_content))
        for line in non_dup:
            tup = langid.classify(line)
            if "en" in tup:
                if "diabetes" in line:
                    line = line.replace("'", "").strip()
                    line = line.replace('"', "")
                    if not line.startswith('"RT'):
                        if not "http" in line and not "https" in line:
                            w.write(line + "\n")
                        else:
                            line = re.sub(
                                r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""",
                                "",
                                line,
                            )
                            w.write(line + "\n")
    f.close()
    w.close()
def _get_date_info(key_minutes: str, date_line: str, verbose: bool = False) -> dict:
    funcs = get_parse_functions(langid.classify(date_line)[0])

    result = {}
    _ = extract_time(date_line)
    result[key_minutes] = _[0] if _ else None

    if verbose:
        print('')

    for fun in funcs:
        try:
            if verbose:
                print("Parse %s: %s" % (fun.__name__, fun(date_line)[0]))
            _ = fun(date_line)[0]
            _1 = datetime.now()

            if str(_) == _1.strftime("%Y-%m-%d %H:%M:%S"):
                continue
            if isinstance(_, datetime) and _.year > _1.year:
                continue

            result[fun.__name__] = _
        except AttributeError as e:
            # if verbose:
            #     print("Run '%s', error - %s" % (fun.__name__, e))
            pass
    return result
Example #17
def process(web_resource):
    text = web_resource.get_text()
    if text:
        (lang, confidence) =  langid.classify(text)
        if confidence >= 0.9:
            return web_resource.url, lang
    return web_resource.url, 'unknown'
Example #18
	def filter_language(self, domain):

		filter_lang_set= set(["en", "es"])

		if langid.classify(domain)[0] in filter_lang_set:
			return True
		return False
Example #19
    def __init__(self, filename):
        """
        Parameters
        ----------
        filename: str
            Path to the (plaintext) file for this document.

        """
        # open file with utf-8-sig to remove any BOMs
        with open(filename, "r", encoding="utf-8-sig", errors="ignore") as infile:
            self.string = clean_whitespace(infile.read())

        self.lang = langid.classify(self.string)[0]
        if self.lang == 'de':
            self._stemmer = GermanStemmer()
        elif self.lang == 'en':
            self._stemmer = EnglishStemmer()
        else:
            print("no stemmer for '{}'".format(self.lang))
            print("falling back to 'de'...")
            self._stemmer = GermanStemmer()

        self.name = os.path.splitext(os.path.split(filename)[1])[0]
        self.ID = self.name
        self.tokens = word_tokenize(self.string)
        self.stems = list(map(self.stem, self.tokens))
        self.length = len(self.tokens)
        self.hashes = list(map(hash, self.stems))
        self.sents = self._get_sents()
        self.freq_dist = dict(Counter(self.hashes))
Example #20
def _guess_language_of_text(text):
    """ Guess the language of a string.

    Returns a dict of type multilanguage text, if possible with the correct language attribution to the text.
    """
    probables = ["en", "de", "fr"]
    text = str(text)
    predefined = _get_predefined_multilanguage_string(text)
    if predefined:
        return predefined
    try:
        lang_txtblob = TextBlob(text).detect_language()
        if lang_txtblob in probables:
            return {lang_txtblob: text}
    except Exception:
        lang_txtblob = None
    try:
        guess = guess_language.guess_language(text, probables)
        if guess in probables:
            return {guess: text}
        if not isinstance(guess, str):
            guess = None
    except Exception as e:
        guess = None
    try:
        la_id, _ = langid.classify(text)
        det = detect(text)
        if det and la_id and det == la_id and det in probables:
            return {det: text}
    except Exception as e:
        det = None
    return {"XX": text}
Example #21
    def compute_sentiment(self, message):
        if message.language == 'en':  # helps prevent misidentified English
            message.language = langid.classify(message.message)[0]

        native_language = iso639_1_to_native.get(message.language, 'english') \
            if hasattr(message, 'language') else 'english'
        iso_639_2 = iso639_1_to_2.get(message.language, 'eng') \
            if hasattr(message, 'language') else 'eng'

        words = self.nltk.tweet_tokenize(message.message.encode('utf8'), True,
                                         True, True)
        # print "native language is", native_language
        no_stopwords = self.nltk.remove_stopwords(words, native_language)
        word_sentiments = [
            senti for senti in map(
                lambda word: self.compute_word_sentiment(word, iso_639_2),
                no_stopwords) if senti is not None
        ]

        return {
            'pos':
            round(
                sum(map(lambda s: s['pos'], word_sentiments)) /
                len(word_sentiments), 3),
            'neg':
            round(
                sum(map(lambda s: s['neg'], word_sentiments)) /
                len(word_sentiments), 3),
            'obj':
            round(
                sum(map(lambda s: s['obj'], word_sentiments)) /
                len(word_sentiments), 3)
        } if any(word_sentiments) else None
Example #22
def langsplit(filename, text):
    cmd = [
        "/home/buck/net/build/mtma_bitext//html_convert/langsplit",
        "--printchunks"]
    proc = Popen(cmd, stdin=PIPE, stdout=PIPE)
    tld = filename.split("/")[0].split(".")[0]
    header = "%s tld:%s uri:%s\n" % (magic_numer, tld, filename)
    proc.stdin.write(header)
    proc.stdin.write(text.encode("utf-8"))
    proc.stdin.write("\n")
    output = proc.communicate()[0]
    if not output.strip():
        # sys.stderr.write("writing debug file.\n")
        # f = open("debug", "w")
        # f.write(header)
        # f.write(text.encode("utf-8"))
        # f.close()

        res = langid.classify(text)
        lang = res[0]
        header = "%s\tlanguage:%s\tbytes:%d\n" % (header.rstrip(),
                                                  lang,
                                                  len(text.encode("utf-8")))
        return header + text
    return output
Example #23
def language_processing():
    path = '/Users/ze/Documents/PycharmProjects/Data/Instagram/'  # MacBookPro
    df = pd.read_csv(path + 'comment.csv', nrows=10)

    df_text = df['text'].str.replace('[’·°–!"#$%&\'()*+,'
                                     '-./:;<=>?@,。?★、…【】()《》?“”‘’![\\]^_`{|}~]+',
                                     " ", regex=True)
    #df_text = df_text.apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
    df_text = df_text.apply(lambda x: x.lower())

    df_language_1 = pd.DataFrame()
    df_language_2 = pd.DataFrame()
    df_language_openai = pd.DataFrame()

    for index, item_iter in df_text.items():
        #df_text.iloc[index] = deEmojify(item_iter)
        try:
            lg = detect(item_iter)
        except Exception:
            lg = 'unknown'
        df_language_1.loc[index, 0] = lg

        try:
            lg = langid.classify(item_iter)
        except Exception:
            lg = ('unknown', 0.0)
        df_language_2.loc[index, 0] = lg[0]

        lg = detect_language(item_iter)
        df_language_openai.loc[index, 0] = lg['answers'][0]

    df_text = pd.concat([df_text, df_language_1, df_language_2,
                         df_language_openai], axis='columns', sort=False)
    df_text.columns = (['text', 'langdetect', 'langid', 'openai'])
Example #24
def googlebooks_scr(parsed_url, date_format='%Y-%m-%d') -> tuple:
    """Create the response namedtuple."""
    parsed_query = parse_qs(parsed_url.query)

    id_ = parsed_query.get('id')
    if id_ is not None:
        volume_id = id_[0]
    else:  # the new URL format
        volume_id = parsed_url.path.rpartition('/')[2]

    dictionary = ris_parse(
        request(
            f'https://{parsed_url.netloc}/books/download/?id={volume_id}'
            f'&output=ris',
            spoof=True).content.decode('utf8'))
    dictionary['date_format'] = date_format
    # manually adding page number to dictionary:
    pg = parsed_query.get('pg')
    if pg is not None:
        pg0 = pg[0]
        dictionary['page'] = pg0[2:]
        dictionary['url'] += f'&pg={pg0}'
    # although google does not provide a language field:
    if not dictionary['language']:
        dictionary['language'] = classify(dictionary['title'])[0]
    return dict_to_sfn_cit_ref(dictionary)
def detect_lang(text1: str, checklen: int = 3000) -> str:
    '''
    Detect Chinese and other languages.

    return a string lang for use in seg_sent(text, lang=lang)
    '''

    if not text1:
        return 'english'

    text0 = text1[:checklen]

    detected = 'en'
    try:
        detected = langid.classify(text0)[0]
    except Exception as exc:
        LOGGER.warning(" langid.classify failed: %s, falls back eo english",
                       exc)

    # if detected not in ['zh', 'en', 'fr', 'it', 'de', 'pt', 'es']: detected = 'en'

    # lang_dict[detected]

    try:
        lang = pycountry.languages.get(alpha_2=detected).name.lower()
    except Exception as exc:
        LOGGER.warning('Unable to detect, %s, setting to english', exc)
        lang = 'english'

    return lang
Example #26
def filter_language(docs, attribute, lang='en', filter_empty=True):
    """ Filter document collection on language.

    Parameters
    ----------
    docs : Sequence[epo_utils.documents.ExchangeDocument]
        Documents to filter.
    attribute : str
        Text attribute to filter on.
    lang : str
        Language code, default "en"
    filter_empty : bool
        If True, empty strings will be filtered as well.

    Yields
    ------
    api.ops.documents.ExchangeDocument
    """
    for doc in docs:
        text = getattr(doc, attribute)

        if filter_empty and not text:
            continue

        elif text:
            if langid.classify(text)[0] == lang:
                yield doc

        elif not text:
            yield doc
Example #27
    def tag_lang_pair(cls, src: str, ref: Optional[str]) -> List[str]:
        machine_tags = []
        src_lang = langid.classify(src)[0]
        ref_lang = None if ref is None else langid.classify(ref)[0]
        if ref is not None and src_lang == ref_lang:
            machine_tags.append(f'lang: {ref_lang}')
        else:
            machine_tags.append(f'src_lang: {src_lang}')
            if ref is not None:
                machine_tags.append(f'trg_lang: {ref_lang}')

        if ref_lang is not None \
                and ref_lang in cls.POTENTIAL_UNSEGMENTED_LANGUAGES \
                and ref.find(' ') == -1:
            machine_tags.append('unsegmented_trg')
        return machine_tags
Example #28
    def translate(self, inputFile, outputFile, lik, ssize):
        try:
            fin = inputFile.decode('utf-8')
        except Exception:
            pass
        lineTuple = langid.classify(inputFile)  # use langid to detect the language of the text
        if lineTuple[0] in lik[0] or lineTuple[
                0] in self.countrylist:  # if the detected language is one of the wanted languages
            if lineTuple[0] not in lik[0]:
                countr = lik[1]
                outurl = outputFile.split('/')
                outurl[-3] = countr
                outurlstr = '/'.join(outurl[:-3])
                sitesize = PathSize().GetPathSize(outurlstr)  # size in MB
                if float(sitesize) >= float(ssize):
                    return True

                outurlFile = '/'.join(outurl)
            p = re.compile(r'[\n]+')
            with codecs.open(outputFile, 'w', "utf-8") as fout:  # open the output file for writing
                try:
                    fout.writelines(p.sub('\n', inputFile))
                except Exception:
                    fout.writelines(p.sub('\n', fin))
            return True
        else:
            logger.error('The language of the file content (%s) does not match the requested article language (%s)!' %
                         (lineTuple[0], ','.join(lik[0])))
            return False
def langCheck(us):
	# filter out names in the format of Axx Bxx.
	p = re.compile("[A-Z][a-z]* [A-Z][a-z]*")
	clean_us = p.sub("", us)

	engFlag = 0
	chiFlag = 0
	for ch in clean_us:
		if u'\u3040' <= ch <= u'\u309f' or \
			u'\u30a0' <= ch <= u'\u30ff':
			return 3
		if u'A' <= ch <= u'Z' or u'a' <= ch <= u'z':
			engFlag = 1
		if u'\u4e00' <= ch <= u'\u9fff' or \
			u'\uac00' <= ch <= u'\ud7af':
			chiFlag = 1
		if engFlag == 1 and chiFlag == 1:
			break

	# return 0 for English, 1 for non-English,
	# 2 for both, 3 for other (Japanese & Spanish)
	if engFlag == 1 and chiFlag == 1:
		return 2
	elif chiFlag == 1:
		return 1
	else:
		if langid.classify(us)[0] == 'es':
			print(us)
			return 3
		else:
			return 0
def check_german(tweet_text):
    emoji_key = pandas.read_csv('DATA/emoji_table.txt', encoding='utf-8', index_col=0)
    emoji_key['count'] = 0
    emoji_dict = emoji_key['count'].to_dict()
    emoji_dict_total = emoji_key['count'].to_dict()
    emoji_list = emoji_dict.keys()


    tweet_text = unicode(tweet_text, 'utf-8')
    tweet_text = tweet_text.encode('utf-8')
    tokens = tweet_text.split(' ')
    new_text = ''
    #delete @username
    for token in tokens:
        if '@' not in token:
            new_text += token + ' '
    new_text = new_text.lower()
    text = unicode(new_text, 'utf-8')
    text = text.encode('utf-8')
    top_language_name = cld.detect(text)
    lang_form_langid = langid.classify(text)
    if new_text == '':
        return True
    # if the text is empty, treat it as German
    if top_language_name[0] == 'GERMAN' or lang_form_langid[0] == 'de':
        return True
    else:
        return False
def detectLanguage(df: pd.DataFrame):
    """
    Given a DataFrame of conversations grouped, detect the language of texts by using langdetect and langid.
    Use a rule to compare their results and make the decision.
    :param df: the DataFrame where a single row consists of only a conversation
    :return: two lists, one stores all conversations in English while the other stores those in other languages
    """
    print("\nDetecting languages...")
    # langdetect (https://github.com/Mimino666/langdetect) works better on long texts, which supports:
    # af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
    # hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
    # pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw
    df['lang1'] = df.apply(lambda row: langdetect.detect(row['body']), axis=1)
    # langid (https://github.com/saffsd/langid.py) works better on short texts,
    # and performs way better when a consideration set is given, which supports:
    # af, am, an, ar, as, az, be, bg, bn, br, bs, ca, cs, cy, da, de, dz, el, en, eo, es, et, eu, fa, fi, fo,
    # fr, ga, gl, gu, he, hi, hr, ht, hu, hy, id, is, it, ja, jv, ka, kk, km, kn, ko, ku, ky, la, lb, lo, lt,
    # lv, mg, mk, ml, mn, mr, ms, mt, nb, ne, nl, nn, no, oc, or, pa, pl, ps, pt, qu, ro, ru, rw, se, si, sk,
    # sl, sq, sr, sv, sw, ta, te, th, tl, tr, ug, uk, ur, vi, vo, wa, xh, zh, zu
    df['lang2'] = df.apply(lambda row: langid.classify(row['body'])[0], axis=1)
    nonEN = []
    # 'lang1' and 'lang2' are now 2 new columns at right hand
    for reset_index, row in df.iterrows():
        # define the criteria here
        # since 'lang2' tends to be more reliable, the rule checks the result from 'lang2' first
        # and sees whether it is consistent with that given by 'lang1'
        if row['lang2'] != 'en' and row['lang1'] == row['lang2']:
            nonEN.append(row['conversationID'])
    print("There are " + str(len(nonEN)) + " chats in other languages.")
    # separate the DataFrame into two, the English and non-English
    chatsEN = df[df.apply(lambda row: row['conversationID'] not in nonEN,
                          axis=1)]
    chatsNonEN = df[df.apply(lambda row: row['conversationID'] in nonEN,
                             axis=1)]
    return chatsEN, chatsNonEN
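As the docstring notes, langid performs noticeably better when it is given a consideration set. A minimal sketch of constraining langid before classifying the 'body' column, assuming the chats can only be in English, German or French:

import langid

# Restrict langid to the languages the corpus can plausibly contain.
langid.set_languages(['en', 'de', 'fr'])

lang, score = langid.classify("bonjour, j'ai un problème avec ma commande")
print(lang)  # expected: 'fr'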
Example #32
def process(self, instance, verbose=True, commit=True, **kwargs):
    """ See source code. """

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    try:
        language, probability = langid.classify(instance.content)

    except:
        LOGGER.exception(u'Exception while detecting language of %s %s',
                         instance_name, instance_id)
        return

    if language:
        if probability > 0.8:
            instance.language = models.Language.get_by_code(language)

            if verbose:
                LOGGER.info(u'language-detector: set %s %s language to %s '
                            u'(confidence: %s).', instance_name, instance_id,
                            instance.language, probability)

            if commit:
                instance.save()

        else:
            LOGGER.warning(u'language-detector: Confidence too low (%s) '
                           u'to set a language on %s %s based on its '
                           u'content.', probability,
                           instance_name, instance_id)

    else:
        LOGGER.warning(u'language-detector: No language detected in %s %s '
                       u'content.', instance_name, instance_id)
Example #33
def check_word(word, article_url, word_context):
    time.sleep(1)
    print(word)
    client.captureMessage("API Checking Word", extra={
        'word': word,
    })

    if not check_api(word):
        client.captureMessage("API Rejection",
                              extra={
                                  'word': word,
                                  'word_context': word_context,
                              })
        return

    language, confidence = langid.classify(word_context)

    if language != 'en':
        client.captureMessage("Language Rejection",
                              extra={
                                  'word': word,
                                  'word_context': word_context,
                                  'confidence': confidence
                              })
        return

    if int(r.get("recently") or 0) < 8:
        r.incr("recently")
        r.expire("recently", 60 * 30)
        tweet_word(word, article_url, word_context)
    else:
        client.captureMessage("Recency Rejection", extra={'word': word})
Example #34
def langsplit(filename, text):
    cmd = [
        "/home/buck/net/build/mtma_bitext//html_convert/langsplit",
        "--printchunks"
    ]
    proc = Popen(cmd, stdin=PIPE, stdout=PIPE)
    tld = filename.split("/")[0].split(".")[0]
    header = "%s tld:%s uri:%s\n" % (magic_numer, tld, filename)
    proc.stdin.write(header)
    proc.stdin.write(text.encode("utf-8"))
    proc.stdin.write("\n")
    output = proc.communicate()[0]
    if not output.strip():
        # sys.stderr.write("writing debug file.\n")
        # f = open("debug", "w")
        # f.write(header)
        # f.write(text.encode("utf-8"))
        # f.close()

        res = langid.classify(text)
        lang = res[0]
        header = "%s\tlanguage:%s\tbytes:%d\n" % (header.rstrip(), lang,
                                                  len(text.encode("utf-8")))
        return header + text
    return output
Example #35
 def __extractEnglishSentences(self, rawtext):
     """Function to extract the English sentences in a string
     :param rawtext: string that we want to clean
     """
     sentences = sent_tokenize(rawtext, 'english')
     return ' '.join(
         [el for el in sentences if langid.classify(el)[0] == 'en'])
Example #36
    def on_data(self, data):
        try:
            tweet = data.split(',"text":"')[1].split('","source')[0]
            tweet = re.sub(r'\shttps?:(.+)', '', tweet)
            lang = langid.classify(tweet)[0]
            if lang == 'en':
                created = int(time.time())
                created_adj = round_ten_min(created)
                clean_tweet = re.sub("^rt\s@.+?:\s", "", tweet.lower())
                clean_tweet = re.sub("[^\w\s]", "", clean_tweet)
                sent = nb.predict(cv.transform([clean_tweet]))[0]
                prob = nb.predict_proba(
                    cv.transform([rem_stop_words(clean_tweet)]))[0][0]
                cur.execute("INSERT INTO mctweets (time, time_adj, text, text_cleaned, \
		sents, prob) VALUES (%s, %s, %s, %s, %s, %s)"                                                             , (created, created_adj, \
                tweet, clean_tweet, sent, prob))
                cur.execute("DELETE FROM mctweets WHERE time < %s;",
                            [created - 90001])
                conn.commit()
            return True
        except BaseException, e:
            print 'failed ondata', str(e)
            print data
            conn.rollback()
            time.sleep(5)
Example #37
 def split(self, input_s):
     self.s = input_s
     self.token = jieba.tokenize(self.s)
     num_en = 0
     num_zh = 0
     for t in self.token:
         if not t[0].isspace():
             if t[0] in ',,"\'‘’“”#@%<>《》{}【】[]。,!!??':
                 self.symbol.append(t)
             else:
                 lang = langid.classify(t[0])[0]
                 if lang == "en":
                     self.english.append(t)
                     num_en += 1
                 elif lang == "zh":
                     self.chinese.append(t)
                     num_zh += 1
                 else:
                     self.other.append(t)
     if num_en == 1 and num_zh == 1:
         code_mix = 1
     if num_en == 0 and num_zh == 0:
         self.note = "other"
     elif num_en > num_zh:
         self.note = "en"
         self.translate_en_zh()
     else:
         self.note = "zh"
         self.translate_zh_en()
Example #38
    def extract(self, text: str, method: str) -> List[Extraction]:
        """

        Args:
            text (str): any text, can contain HTML
            method (Enum[IdentificationTool.LANGID, IdentificationTool.LANGDETECT]): specifies which of the two
            algorithms to use

        Returns:
            List(Extraction): an extraction containing the language code used in the text. Returns the empty list if
            the extractor fails to identify the language in the text.

        """
        if method == IdentificationTool.LANGID.name:
            language = classify(text)[0]
            return [Extraction(value=language, extractor_name=self.name)]

        elif method == IdentificationTool.LANGDETECT.name:
            try:
                language = detect(text)
            except:
                language = 'unknown'

            if language == 'unknown':
                return list()
            else:
                return [Extraction(value=language, extractor_name=self.name)]

        else:
            return list()
Example #39
 def process(self, tup):
   text = tup.values[1]
   language = langid.classify(text)[0]
   #l = LangID()
   #l.train()
   #language = l.classify(text)
   storm.emit([tup.values[0], language])
Example #40
def comment_process(comment):

    # return value
    res = defaultdict(int)
    nltk_classifier = nltk_sentiment()

    # filtering rules
    filter_res = filter(comment)
    if filter_res != 'true':
        fail[filter_res] += 1
        return {}

    # count emoji in the comment
    res['emoji'] = cal_emo(comment)
    # remove emoji from the comment
    comment = delete_emo(comment)
    # predict sentiment
    if (len(comment) > 2):

        # detect the language
        lan = langid.classify(comment)
        langage[lan[0]] += 1

        ss = nltk_classifier.polarity_scores(comment)

        ss.pop('compound')
        # filter out weak signals: keep only scores greater than 0.5
        max_key = max(ss, key=ss.get)
        if ss[max_key] > 0.5:
            res[max_key + '_num'] = 1
            res[max_key + '_value'] = ss[max_key]

    return res
Example #41
def detect_lang(text):
    """Returns language of input string text.

    :param: text: string
    :return: language: returns language code string, e.g. cs
    """
    return unicode(langid.classify(text)[0])
Example #42
def split_by_language(reviews):
    """
    Split the reviews based on their language.
    input arguments:
        reviews: a list of review items
    output arguments:
        reviews_dict_languages: a dictionary with languages as keys, 
                                and a list of the corresponding reviews as value.
    """

    # Initialization
    reviews_dict_languages = {}
    langid.set_languages(language_list)

    # Use a counter to visualize the progress
    count = 1

    # Loop over all reviews
    for review in reviews:

        # Detect the language
        language = langid.classify(review.content)[0]

        #Store the review in the corresponding dictionary by language
        if language in reviews_dict_languages:
            reviews_dict_languages[language].append(review)
        else:
            reviews_dict_languages[language] = []
            reviews_dict_languages[language].append(review)

    return reviews_dict_languages
Example #43
def process_sentence(sentence):
    '''
    Only process Chinese Sentence.
    '''
    if langid.classify(sentence)[0] == 'zh':
        return segment_chinese_sentence(sentence)
    return sentence
Example #44
    def __init__(self,
                 db='postgres:///novichenkobot',
                 connection=None,
                 keywords='inputs/keywords.txt',
                 *args,
                 **kwargs):
        super(GeneralSpider, self).__init__(*args, **kwargs)
        self.le = LinkExtractor()

        # load keywords dictionaries
        self.keywords = {}
        with open(keywords) as f:
            for line in f:
                lang, wordstr = line.split(':')
                words = [word.strip().lower() for word in wordstr.split(',')]
                self.keywords[lang] = words

        # langid lazily loads models for language identification,
        # and calling it here forces it to load the models now
        lang = langid.classify('test')[0]

        # database connection
        if connection is None:
            engine = sqlalchemy.create_engine(
                db,
                connect_args={
                    'connect_timeout': 120,
                    'application_name': 'NovichenkoBot_GeneralSpider',
                })
            self.connection = engine.connect()
        else:
            self.connection = connection
Example #45
def check_lang(data_str):
    predict_lang = langid.classify(data_str)
    if predict_lang[1] >= .9:
        language = predict_lang[0]
    else:
        language = 'NA'
    return language
Example #46
def extract_dict_ft_format_zh(file,out_file):
    '''multi-label zh'''
    with open(out_file,'w') as out_f:
        with open(file,'r') as in_f:
            for line in tqdm(in_f.readlines()):
                sent = line.split('\t',1)
                label_str,sent_comment = sent
                sent_comment = sent_comment.replace("\n",'')
                #sent_comment = clean_str(sent_comment)   # use for en
                sent_comment = sent_comment.replace(" ","") #zh
                seg_line = jieba.cut(sent_comment)#zh
                sent_comment = clean_seg_coment(seg_line)#zh
                label = []
                if label_str == 'NULL':
                    final_line = '__label__' + 'NULL' + ' ' + sent_comment + '\n'
                    out_f.write(final_line)
                else:
                    label_dict = eval(label_str)
                    flag = 0 #zh
                    for k,v in label_dict.items():
                        hit_lang = langid.classify(v["hit"])[0] # zh
                        if hit_lang != "zh": #zh
                            flag = 1  # zh
                        label.append('__label__'+v['label'])
                    tmp_label = ' '.join([x for x in label])
                    final_line = tmp_label  + ' ' + sent_comment + '\n'
                    if flag == 0: #zh
                        out_f.write(final_line)
Example #47
    def parse(text):

        lang, *_ = langid.classify(text)
        lines = text.strip().split('\n')

        output = []

        day, month, year, time, place, date = [None]*6

        for line in lines:
            line = line.strip()
            if line and DataParser.filter(line):
                output.append(line)

            pattern = re.compile('([0-9]+) ([A-Z]+) ([0-9]+) ([0-9]+:[0-9]+[AP]M) by PIB (.+)')
            matches = pattern.match(line)
            if matches:
                day, month, year, time, place = matches.groups()
                date_string = f'{month} {day} {year} {time}'
                date = datetime.strptime(date_string, '%b %d %Y %I:%M%p')

        content = '\n'.join(output)

        return { 
            "lang": lang, "content": content,
            "date": date, "city": place
        }
Example #48
def clean_train_data(train_data, min_ents=0, min_text_len=5, lang=['de']):
    """ removes items with no entities or fewer entities then min_ents
        :param train_data: A list of lists of spacy-like NER Tuple\
        [(('some text'), entities{[(15, 19, 'place')]}), (...)]
        :param min_ents: An integer defining the minimum number of entities.
        :param min_text_len: An integer defining the minimum length of the text itself.
        :param lang: A list of language codes. If populated, only samples matching those languages will\
        be included into the returned results.
        :return: A list of lists of spacy-like NER Tuple\
        [(('some text'), entities{[(15, 19, 'place')]}), (...)]
    """

    TRAIN_DATA = []
    for x in train_data:
        try:
            ents = x[1]
        except TypeError:
            ents = None
        if ents and len(ents['entities']) >= min_ents and len(
                x[0]) >= min_text_len:
            TRAIN_DATA.append(x)
    if len(lang) > 0:
        TRAIN_DATA_LANG = []
        for x in TRAIN_DATA:
            lng, prob = langid.classify(x[0])
            if lng in lang:
                TRAIN_DATA_LANG.append(x)
        return TRAIN_DATA_LANG

    return TRAIN_DATA
Example #49
def transcribe(text, lang=None, alphabet="IPA",
               syllabic_separator=u".", stress_mark=u"'", word_separator=u"|",
               auto_lang=False):
    """
    Get the phonetic transcription of `text`

    :param text: unicode string to transcribe
    :param lang: string with the ISO 639-1 code or IETF language tag of `text`
    :param alphabet: string with the name of the phonetic alphabet to use
    :param syllabic_separator: string with the syllabic separator character
    :param stress_mark: string to mark the stress in words
    :param word_separator: string with the word separator character
    :param auto_lang: boolean to perform an automatic language identification
    :return: string with the phonetic transcription of `text`
    """
    if auto_lang or not lang:
        if not langid:
            raise ImportError("Please, install langid")
        lang = langid.classify(text)[0]
    transcriptor = get_transcriptor(
        lang=lang,
        alphabet=alphabet,
        syllabic_separator=syllabic_separator,
        word_separator=word_separator,
        stress_mark=stress_mark)
    return transcriptor.transcribe(
        text=text,
        syllabic_separator=syllabic_separator,
        word_separator=word_separator,
        alphabet=alphabet,
        stress_mark=stress_mark,
    )
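A hypothetical usage of the function above: with auto_lang=True the language argument is left to langid, assuming a transcriptor is registered for the detected language:

# Let langid pick the language, then transcribe to IPA (illustrative input).
ipa = transcribe(u"buenos días", auto_lang=True)
print(ipa)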
def clean(fileName):
    brandList = []
    listFile = open(fileName, 'r')
    for line in listFile:
        if not line.startswith('#'):
            brandList.append(line.strip())
    listFile.close()

    for brand in brandList:
        print(brand)
        userTweets = {}
        tweetFile = open('data/userTweets2/'+brand+'.json', 'r')
        for line in tweetFile:
            try:
                data = json.loads(line.strip())
            except:
                continue
            userID = data['user']
            if userID not in userTweets:
                userTweets[userID] = []
            tweets = data['statuses']
            for tweet in tweets:
                if len(tweet['text']) > 5:
                    if langid.classify(tweet['text'])[0] == 'en':
                        userTweets[userID].append(tweet)
                if len(userTweets[userID]) > 20:
                    break
        tweetFile.close()

        outputFile = open('data/userTweets2/clean2/' + brand + '.json', 'w')
        for userID, tweets in userTweets.items():
            if len(tweets) > 19:
                output = {'user_id': userID, 'statuses': tweets}
                outputFile.write(json.dumps(output)+'\n')
        outputFile.close()
Example #51
    def write(self, text):
        '''
        writes input to a dictionary
        '''

        en_list = []                                            # create a list to handle English input 
        ru_list = []                                            # create a list for processing Russian input  
        in_list = text.split(' ')                               # list for input string 
        
        if not text:                                            

            self.lbl.setText('NONE')                           
            return None
            
        
        for word in in_list:                                    # we iterate over the entered words
        
            ch, coal = langid.classify(word)                    # ch stores the language of the word 
        
            if ch == 'en' and word !='' and word != ' ':        # if the word is in english
        
                en_list.append(word)                            # add to the list for English words 
        
            elif ch == 'ru':                                    # if the word is in Russian 
        
                ru_list.append(word)                            # add to the list for Russian words
                
        if len(en_list) > 1 or '-' in text:                     # the English input is a phrase or a hyphenated word rather than a single word

            str_en = ' '.join(str(e) for e in en_list)
            str_ru = ' '.join(str(e) for e in ru_list)
            # the date is added to the list of dictionary values for possible further sorting
            self.di_ten[str_en] = [str_ru, '{:%Y-%m-%d %H:%M:%S}'.format(datetime.now())]
Example #52
def prediction(text):
    if os.path.exists(model_location):
        model = joblib.load(model_location)
    else:
        model = model_training()
    # lang_detected = detect(text)
    lang_detected = classify(text)[0]
    print(text)
    prediction = model.predict(
        laser.embed_sentences([text], lang=lang_detected))
    probability = model.predict_proba(
        laser.embed_sentences([text], lang=lang_detected))
    probability[0].sort()
    max_probability = max(probability[0])
    # if (max_probability-0.35) > probability[0][-2]:
    if max_probability > 0.63:
        pred_output = prediction[0]
    else:
        pred_output = 'None'
    print('{}-------------->{}'.format(max(probability[0]), pred_output))
    return ({
        'probability': max(probability[0]),
        'output': pred_output,
        'actual_output': prediction[0]
    })
 def is_language(self, s, expected_lang):
     """ Check if the language of the segment cannot be reliably identified
     as another language. If another than the expected language is
     detected return False """
     expected_lang = expected_lang.lower()
     if self.valid_languages:
         assert expected_lang in self.valid_languages
     if self.use_cld2:
         reliable, _text_bytes, details = cld2.detect(
             s.encode("utf-8"),
             isPlainText=True,
             useFullLangTables=True,
             bestEffort=True)
         if reliable:
             for _lang, langcode, confidence, score in details:
                 if langcode == expected_lang and confidence >= 10:
                     return True
             return False
         else:  # unreliable is still counted as OK
             return True
     else:
         lang, confidence = langid.classify(s.lower())
         if lang != expected_lang and confidence > 0.9:
             # confidence for wrong language higher than 90%
             return False
         else:
             return True
def check_german(tweet_text):

    if isinstance(tweet_text, unicode) is False:
        tweet_text = unicode(tweet_text, 'utf-8')
        tweet_text = tweet_text.encode('utf-8')
    tokens = tweet_text.split(' ')
    new_text = ''
    #delete @username
    for token in tokens:
        if '@' not in token:
            new_text += token + ' '
    new_text = new_text.lower()
    if isinstance(new_text, unicode) is False:
        text = unicode(new_text, 'utf-8')
        text = text.encode('utf-8')
    else:
        text = new_text.encode('utf-8')
    top_language_name = cld.detect(text)
    lang_form_langid = langid.classify(text)
    if new_text == '':
        return True
    #if text empty - german
    if top_language_name[0] == 'GERMAN' or lang_form_langid[0] == 'de':
        return True
    else:
        return False
Example #55
 def on_data(self, data):
     if time.time() >= self.started + self.duration:
         stats = open('{0}-sample.stats'.format(int(self.started)), 'w+')
         stats.write("================= STATISTICS =================" + "\n")
         stats.write("Start time: " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.started)) + "\n")
         stats.write("End time: " + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + "\n")
         stats.write("First Tweet ID: " + self.first_tweet_id + "\n")
         stats.write("Last Tweet ID: " + self.last_tweet_id + "\n")
         stats.write("Language: " + self.lang + "\n")
         stats.write("Language classification threshold: " + str(self.lang_threshold) + "\n")
         stats.write("Above threshold: " + str(self.counter[self.lang + '-above']) + "\n")
         stats.write("Below threshold: " + str(self.counter[self.lang + '-below']) + "\n")
         stats.write("Exluded: " + str(self.counter['excluded']) + "\n")
         return False
     elif 'in_reply_to_status_id' in data: 
         status = Status.parse(self.api, json.loads(data))
         langclass = langid.classify(status.text)
         
         if (self.counter == {self.lang + '-above':0, self.lang + '-below':0, 'excluded':0}):
             self.first_tweet_id = str(status.id)
         self.last_tweet_id = str(status.id)
         
         if (langclass[0] == self.lang):                
             if langclass[1] >= self.lang_threshold:
                 self.above_output.write(data)
                 self.counter[self.lang + '-above'] += 1
             else:
                 self.below_output.write(data)
                 self.counter[self.lang + '-below'] += 1
         else:
             self.excl_output.write(data)
             self.counter['excluded'] += 1
            
         return True
Example #56
    def gettinglan(self, text):

        lanre = langid.classify(text)
        language = lanre[0]
        certainty = lanre[1]

        return language, certainty
Example #57
def extract_tweet(filename, min_num_chars=10):
    with open(filename, 'r') as tweetfile:
        for tweet in tweetfile:            
            tweettext = parser.get_tweet_text(tweet,remove_urls=True, remove_retweets=True, remove_usernames=True)
            if(len(tweettext) > min_num_chars):
                tweetlang = langid.classify(tweettext)
                print(tweettext[0:50] if len(tweettext) > 50 else tweettext, " :    ", tweetlang, '\n')
Example #58
 def apply(self, text, evaluation):
     'LanguageIdentify[text_String]'
     import langid  # see https://github.com/saffsd/langid.py
     # an alternative: https://github.com/Mimino666/langdetect
     import pycountry
     code, _ = langid.classify(text.get_string_value())
     language = pycountry.languages.get(iso639_1_code=code)
     return String(language.name)
Example #59
def filter_language (df):
	df['lang'] = ''
	df['prob'] = ''

	for index, row in df.iterrows():
		df.loc[index, 'lang'], df.loc[index, 'prob'] = langid.classify(row['text'])

	return df[df.lang == 'en']
Example #60
 def parse_article(self, url, html):
     rdoc = readability.Document(html)
     summary = rdoc.summary()
     lang_id, _ = langid.classify(summary)
     article = newspaper.Article(url, config=self.config, language=lang_id)
     article.set_html(html)
     article.parse()
     return article