Esempio n. 1
0
 def detect(text):
     """Return the cld3-predicted language code for *text*.

     When the first prediction's confidence is below 0.5, the text is
     cleaned and re-detected, and that language is returned instead.
     """
     language, confidence, _is_reliable, _proportion = cld3.get_language(
         text)
     if confidence >= 0.5:
         return language
     # Low confidence: retry on the cleaned text; index 0 is the language.
     return cld3.get_language(clean(text))[0]
def get_misclassification_stats(dataset):
    """Print how often cld3 and langid mislabel (assumed-English) tweets.

    Each non-empty tweet is stripped of mentions, URLs and hashtags,
    truncated to its first 8 whitespace-separated tokens, and classified
    by both detectors; any prediction other than "en" counts as a
    misclassification.

    Args:
        dataset: iterable of tweet strings.
    """
    total_tweets = 0
    misclassified_cld = 0
    misclassified_langid = 0
    # Hoisted out of the loop: mentions, hashtags, and URLs.
    noise_pattern = re.compile(r"(?:\@|\#|https?\://)\S+")
    for t in dataset:
        if len(t) > 0:
            # Remove mentions, URLs, and hashtags.
            t = noise_pattern.sub("", t)
            t = " ".join(t.split()[:8])

            if len(t) > 0:

                cld_prediction = cld3.get_language(t)
                langid_prediction = langid.classify(t)

                if cld_prediction[0] != "en":
                    misclassified_cld += 1
                if langid_prediction[0] != "en":
                    misclassified_langid += 1

                total_tweets += 1

    # Guard: the original divided by zero when nothing was classifiable.
    if total_tweets == 0:
        print("No classifiable tweets found.")
        return

    print("CLD accuracy: ", total_tweets - misclassified_cld, "/",
          total_tweets, "=", (total_tweets - misclassified_cld) / total_tweets)
    print("Langid accuracy: ", total_tweets - misclassified_langid, "/",
          total_tweets, "=",
          (total_tweets - misclassified_langid) / total_tweets)
Esempio n. 3
0
def find_new_links(htmlstring,
                   base_url,
                   known_links,
                   language=None,
                   rules=None):
    """Extract and filter new internal links after an optional language check."""
    new_links = []
    # Optional language gate: when a detector is available and the page's
    # baseline text is identified as another language, stop early.
    if language is not None and LANGID_FLAG is True:
        _, text, _ = baseline(htmlstring)
        result = cld3.get_language(text)
        if result is not None and result.language != language:
            return new_links, known_links
    # Walk candidate links, dropping robots-forbidden, already-known,
    # or non-crawlable ones; everything else is returned and recorded.
    candidates = extract_links(htmlstring,
                               base_url,
                               False,
                               language=language,
                               with_nav=True)
    for candidate in candidates:
        if rules is not None and not rules.can_fetch("*", candidate):
            continue
        if is_known_link(candidate, known_links) is True or is_not_crawlable(candidate):
            continue
        new_links.append(candidate)
        known_links.add(candidate)
    return new_links, known_links
Esempio n. 4
0
    def parse_item(self, response):
        """Yield a structured record (heading, date, tags, language, body)
        for one article page."""
        article = response.css('section.article-column > div.article-text')

        heading = article.css('h1::text')
        date = article.css('div.item.time::text')

        # Prefer the styled sub-heading, falling back to a plain <h2>.
        sub_heading = (article.css('div.like-h2::text').extract_first()
                       or article.css('h2::text').extract_first())

        text = article.css('div').xpath('p//text()').extract()
        body_parts = [sub_heading] + text if sub_heading else text

        body = ' '.join(body_parts)
        lang = cld3.get_language(body).language

        tags = article.css('div.tags > a::text')

        yield {
            'heading': heading.extract_first(),
            'date': date.extract_first(),
            'tags': tags.extract(),
            'lang': lang,
            'body': body
        }
Esempio n. 5
0
def convert_warc_to_csv(arguments):
    """Stream a WARC archive and write matching-language pages to CSV.

    Keeps only HTML response records whose cleaned text is non-empty and
    whose detected language equals arguments["language"]; stops once
    arguments["count"] rows have been written.
    """
    counter = 0
    with open(arguments["input_path"], 'rb') as input_file, \
         open(arguments["output_path"], "w", newline='') as output_file:

        writer = csv.writer(output_file, delimiter=',', quotechar='"')

        for record in ArchiveIterator(input_file):
            # Skip everything that is not an HTML response.
            if record.rec_type != 'response':
                continue
            if record.http_headers.get_header('Content-Type') != 'text/html':
                continue

            clean_text = clean_html(record.content_stream().read())
            if len(clean_text) == 0:
                continue

            language_prediction = cld3.get_language(clean_text)
            if language_prediction.language != arguments["language"]:
                continue

            writer.writerow([clean_text, language_prediction])
            counter += 1

            if counter >= int(arguments["count"]):
                return
            if counter % 100 == 0:
                logger.info("Saved " + str(counter) + " websites")
 def _cld3_detection(self, doc: AnyStr) -> (AnyStr, float):
     """Detect the language of *doc* with cld3.

     Returns a (language_id, probability) pair; the two-letter id is
     remapped so it matches langid's naming scheme.
     """
     detection = cld3.get_language(doc)
     lang_id = detection.language[:2]
     # Make cld3 codes compatible with langid.
     for src_code, dst_code in LANGUAGE_REMAPPING.items():
         lang_id = lang_id.replace(src_code, dst_code)
     return (lang_id, float(detection.probability))
Esempio n. 7
0
def get_lange_cld(text, get_prob=False):
    """Return the cld3 language code for *text*.

    Args:
        text: string to classify.
        get_prob: when true, also return the prediction probability.

    Returns:
        The language code, or a (language, probability) tuple when
        get_prob is true.
    """
    output = cld3.get_language(text)
    lang = output.language

    # Idiomatic truthiness test instead of the original `== False`.
    if not get_prob:
        return lang
    return lang, output.probability
Esempio n. 8
0
 def _cld3_detection(self, doc: AnyStr) -> (AnyStr, float):
     """Detect the language of a string using the `cld3` library"""
     detection = cld3.get_language(doc)
     # Two-letter code, remapped to langid's naming scheme.
     lang_id = detection.language[:2]
     for src_code, dst_code in LANGUAGE_REMAPPING_PYCLD3_LANGID.items():
         lang_id = lang_id.replace(src_code, dst_code)
     return (lang_id, float(detection.probability))
Esempio n. 9
0
def language_filter(temp_text, temp_comments, target_language, docmeta):
    '''Run external component (if installed) for language identification.

    Returns True when the detected language differs from
    *target_language* (i.e. the document should be filtered out),
    False otherwise.
    '''
    # sanity check on language
    if target_language is not None:
        if LANGID_FLAG is True:
            # classify whichever block carries more text: comments or body
            if len(temp_comments) > len(temp_text):
                result = cld3.get_language(temp_comments)
            # default
            else:
                result = cld3.get_language(temp_text)
            # cld3.get_language can return None when it has no prediction;
            # treat that as "no evidence of a wrong language" instead of
            # crashing on the attribute access.
            if result is not None and result.language != target_language:
                LOGGER.warning('wrong language: %s %s %s', result,
                               docmeta['id'], docmeta['url'])
                return True
        else:
            LOGGER.warning('Detector not installed, no language detection run')
    return False
Esempio n. 10
0
def _validate_language(text, language):
    """Return True when *text* is reliably detected as *language*,
    or when *language* is outside cld3's repertoire (cannot be checked)."""
    if language not in CLD3_LANG_CODES:
        return True

    lang_res = cld3.get_language(text)
    # Valid only when the detector is both confident and in agreement.
    return bool(lang_res.is_reliable and lang_res.language == language)
Esempio n. 11
0
def detect():
    """Flask endpoint: detect the language of the posted 'sentence'."""
    payload = request.get_json()
    prediction = cld3.get_language(payload['sentence'])
    return jsonify({
        "lang": prediction.language,
        "probability": prediction.probability,
        "is_reliable": prediction.is_reliable
    })
Esempio n. 12
0
    def _detect_language(self, text):
        """Tries to detect the language of a text input. Outputs a BCP-47-style
        language code (e.g. 'en')."""

        info = cld3.get_language(text)
        # Only trust reliable predictions; anything else yields None.
        if info is None or not info.is_reliable:
            return None
        return info.language
Esempio n. 13
0
def remove_mixed_language_items(samples):
    """Drop samples whose detected-language probability is below 0.95.

    Mutates *samples* in place and returns it.
    """
    drop_indices = [
        i for i, item in enumerate(samples)
        if cld3.get_language(item['text']).probability < 0.95
    ]
    # Delete from the end so earlier indices remain valid.
    for idx in reversed(drop_indices):
        del samples[idx]
    return samples
Esempio n. 14
0
    def detect_lang(self, text):
        """Detect *text*'s language; prompt the operator when cld3 is unsure."""
        src_lang, _, is_reliable, _ = cld3.get_language(text)
        if is_reliable:
            return src_lang

        # Unreliable detection: announce it audibly (macOS `say`) and
        # fall back to manual entry by the operator.
        os.system(
            'say "Not certain which language that is. Please decide."')
        print("Text:\n{}".format(text))
        return input(
            "Please enter source language abbreviation after scheme in CLD3 github:"
        )
Esempio n. 15
0
def load_covid_data():
    """Load tip-line claims from 'covid.csv', grouped by partner and language.

    Returns:
        (partners, tip_line_requests): the set of partner slugs and a
        nested dict partner -> language -> deduplicated list of tip
        dicts, each enriched with 'text', 'language' and 'embedding'.
    """
    # Read the whole CSV into memory; the first row is the header.
    with open('covid.csv') as csvfile:
        tip_line_requests = csv.reader(csvfile)
        tip_line_requests = [item for item in tip_line_requests]
    csv_headers = tip_line_requests[0]
    tip_line_requests = tip_line_requests[1:]

    # Convert each row into a dict keyed by the header names.
    temp_tip_line_requests = []
    for row in tip_line_requests:
        item = {}
        for i, key in enumerate(csv_headers):
            item[key] = row[i]
        temp_tip_line_requests.append(item)
    tip_line_requests = temp_tip_line_requests

    # Keep only rows flagged as claims.
    tip_line_requests = [
        tip for tip in tip_line_requests if tip['claim_type'] == 'Claim'
    ]
    # Choose the longer of media_text/media_title (skipping 'NA'),
    # strip emoji, and tag the language when cld3 yields a prediction.
    for tip in tip_line_requests:
        tip['text'] = remove_emoji(
            tip['media_text'] if tip['media_text'] != 'NA'
            and len(tip['media_text']) >= len(tip['media_title']) else
            tip['media_title'])
        lang_data = cld3.get_language(tip['text'])
        if lang_data is not None:
            tip['language'] = lang_data.language
    # Drop tips with no usable text or no detected language.
    tip_line_requests = [
        tip for tip in tip_line_requests if tip['text'] != 'NA'
        and not tip['text'].isspace() and 'language' in tip
    ]

    # Bucket tips per partner, keeping only that partner's configured
    # languages, embed each text, then deduplicate each language bucket.
    partners = set([item['team_slug'] for item in tip_line_requests])
    temp_tip_line_requests = {}
    for partner in partners:
        partner_tips = [
            item for item in tip_line_requests if item['team_slug'] == partner
        ]
        temp_tip_line_requests[partner] = {
            lang: []
            for lang in partner_languages[partner]
        }
        for tip in partner_tips:
            if tip['language'] in partner_languages[partner]:
                tip['embedding'] = get_sentence_embedding(
                    tip['text'], tip['language'])
                temp_tip_line_requests[partner][tip['language']].append(tip)
        for language in partner_languages[partner]:
            temp_tip_line_requests[partner][
                language] = remove_duplicate_requests(
                    temp_tip_line_requests[partner][language])

    tip_line_requests = temp_tip_line_requests
    return partners, tip_line_requests
Esempio n. 16
0
 def get_lang(self):
     """Lazily detect and cache the page language.

     Runs only when the page was read successfully and nothing has been
     cached yet; stores the raw cld3 prediction, or None on failure.
     """
     if self.successfully_read and self._lang is None:
         # Normalise the XML declaration before boilerplate removal.
         # NOTE(review): the pattern's leading `<?` makes the `<` optional —
         # presumably meant to match `<?xml ... encoding ...?>`; confirm.
         utf_text_to_deboilerpipe = re.sub(r'<?xml.*encoding.*?>',
                                           '<?xml version="1.0"?>',
                                           self.utf_text)
         try:
             article = alcazar.bodytext.parse_article(
                 utf_text_to_deboilerpipe)
             if article.body_text:
                 self._lang = cld3.get_language(article.body_text)
         except Exception:
             # Narrowed from a bare `except:` so KeyboardInterrupt and
             # SystemExit are no longer swallowed.
             self._lang = None
     return self._lang
Esempio n. 17
0
def group_tiplines_by_language(tip_line_requests, languages=None):
    """Bucket tip-line requests by their cld3-detected language.

    Each tip's 'text' is the longer of media_text/media_title (skipping
    'NA'), with emoji removed; tips with no usable text, no detection,
    or 20 characters or fewer are dropped.

    Args:
        tip_line_requests: list of tip dicts (mutated in place).
        languages: language codes to keep; defaults to the original set.

    Returns:
        dict mapping language code -> list of matching tips.
    """
    # Default moved out of the signature: mutable default arguments are
    # shared across calls.
    if languages is None:
        languages = ['en', 'pt', 'hi', 'hi-Latn', 'mr', 'bn', 'ta', 'te', 'ml']
    for tip in tip_line_requests:
        tip['text'] = remove_emoji(
            tip['media_text'] if tip['media_text'] != 'NA' and len(tip['media_text']) >= len(tip['media_title']) else
            tip['media_title'])
        lang_data = cld3.get_language(tip['text'])
        if lang_data is not None:
            tip['language'] = lang_data.language
    tip_line_requests = [tip for tip in tip_line_requests if tip['text'] != 'NA' and not tip['text'].isspace() and 'language' in tip and len(tip['text']) > 20]

    return {
        language: [item for item in tip_line_requests
                   if item['language'] == language]
        for language in languages
    }
Esempio n. 18
0
def get_summary(text: str,
                percentage: float = None,
                abstractive: bool = False):
    """Summarize Hindi text; any other detected language yields HTTP 418."""
    if get_language(text).language != 'hi':
        raise HTTPException(status_code=418,
                            detail="Summarization only available for Hindi.")

    summary = Summary(text, percentage, abstractive)
    return {
        "summary": summary,
        "response_length": len(summary),
        "original_length": len(text),
    }
Esempio n. 19
0
def detect_lang_neural(text,
                       return_multiple=False,
                       return_dict=False,
                       hint_language=None,
                       filter_unreliable=False):
    """Detect the language(s) of *text* with the cld3 neural model.

    Args:
        text: string to classify.
        return_multiple: return a list of candidates instead of the top one.
        return_dict: return dicts with lang_code/lang/conf keys.
        hint_language: if a candidate matches this code, keep only it.
        filter_unreliable: drop predictions cld3 flags as unreliable.

    Returns:
        The top language (or None) when return_multiple is false,
        otherwise a list of languages/dicts.

    Raises:
        ImportError: when pycld3 is not installed.
    """
    if cld3 is None:
        LOG.debug("run pip install pycld3")
        raise ImportError("pycld3 not installed")
    languages = []
    if return_multiple or hint_language:
        preds = sorted(cld3.get_frequent_languages(text, num_langs=5),
                       key=lambda i: i.probability,
                       reverse=True)
        for pred in preds:
            if filter_unreliable and not pred.is_reliable:
                continue
            if return_dict:
                languages += [{
                    "lang_code": pred.language,
                    "lang": code_to_name(pred.language),
                    "conf": pred.probability
                }]
            else:
                languages.append(pred.language)

            # A matching hint wins outright: keep only that prediction.
            if hint_language and hint_language == pred.language:
                languages = [languages[-1]]
                break
    else:
        pred = cld3.get_language(text)
        # cld3 returns None when it cannot classify at all; treat that
        # the same as an unreliable prediction instead of crashing.
        if pred is None or (filter_unreliable and not pred.is_reliable):
            pass
        elif return_dict:
            languages = [{
                "lang_code": pred.language,
                "lang": code_to_name(pred.language),
                "conf": pred.probability
            }]
        else:
            languages = [pred.language]

    # return top language only
    if not return_multiple:
        if not len(languages):
            return None
        return languages[0]
    return languages
Esempio n. 20
0
def song_to_lines(song, min_lines=20, acceptable_languages=["en"]):
    """
    takes a song and returns an array where
    each line is an element
    """
    # Guard clauses: no lyrics, or too few lines, yield nothing.
    if "lyrics" not in song or not song["lyrics"]:
        return []
    lines = song["lyrics"].lower().split("\n")
    if len(lines) < min_lines:
        return []
    # Keep only songs whose lyrics are reliably in an accepted language.
    prediction = cld3.get_language(song["lyrics"])
    if prediction.is_reliable and prediction.language in acceptable_languages:
        return lines
    return []
Esempio n. 21
0
    def parse_news(self, response):
        """Yield a structured record for one news article page."""
        self.log(response.body)
        article = response.css('div.article-content')

        heading = article.css('div.title > h1::text')
        date = article.css('div.title > div > span::text')

        # Join all paragraph text nodes into one body string.
        paragraphs = article.css('span._ga1_on_').xpath('p//text()').extract()
        body = ' '.join(paragraphs)
        lang = cld3.get_language(body).language

        tags = response.css('div.article-content > div.tag > a::text')

        yield {
            'heading': heading.extract_first(),
            'date': date.extract_first(),
            'lang': lang,
            'tags': tags.extract(),
            'body': body
        }
Esempio n. 22
0
    def parse_article(self, response):
        """Yield heading, date, tags, language, URL and body for one article."""
        article = response.css('main.main-col > div.main-col__left')

        heading = article.css(
            'div.news-full__head > h1.news-full__title::text')
        date = article.css('div.news-full__head > time.news-full__date::text')
        tags = article.css('div.news-full-tags > div > span::text')

        # Join all paragraph text nodes into one body string.
        body = ' '.join(
            article.css('div.news-full__text').xpath('p//text()').extract())
        lang = cld3.get_language(body).language

        yield {
            'heading': heading.extract_first(),
            'date': date.extract_first(),
            'lang': lang,
            'tags': tags.extract(),
            'url': response.request.url,
            'body': body
        }
Esempio n. 23
0
 def run(self):
     """Serve one client connection: read UTF-8 text and reply with the
     string form of the cld3 prediction until the peer disconnects,
     the socket times out, or the payload cannot be decoded."""
     print("Connected: " + str(self.address))
     while True:
         try:
             request = self.socket.recv(4096)
             request = request.decode('utf-8')
             # An empty read means the peer closed the connection.
             if not request:
                 break
             print("Text in: " + request)
             result = str(cld3.get_language(request))
             print("Result: " + result)
             # NOTE(review): replies are sent through the module-level
             # `client` socket rather than self.socket — confirm this is
             # intentional and not a copy/paste slip.
             client.send(str.encode(result))
             print()
             #lock.release()
         except socket.timeout as e:
             print(str(e))
             break
         except UnicodeDecodeError as e:
             # Undecodable input: signal the client with a single "X".
             print("Decode Error")
             client.send(str.encode("X"))
     self.socket.close()
     print("Disconnect")
Esempio n. 24
0
    def save(self, *args, **kwargs):
        """Persist the narrative, refreshing derived fields as needed.

        Normalises empty strings to None, regenerates html/raw/lead/slug,
        stamps the first publish time, and classifies the body language
        with cld3 when the body is long enough.
        """
        # Treat empty strings as missing values. (The original used
        # `len(x) is 0`, which identity-compares ints and only works by
        # accident of CPython small-int interning.)
        if isinstance(self.title, str) and len(self.title) == 0:
            self.title = None
        if isinstance(self.body, str) and len(self.body) == 0:
            self.body = None
        if not self.sketch or kwargs.pop('generate', False):
            if self.body:
                self.html = generate.html(self.body)
                self.raw = generate.raw(self.html)

        if self.raw and kwargs.pop('update_lead', True):
            self.lead = generate.lead(self.raw)

        if self.title and kwargs.pop('update_slug', True):
            self.slug = generate.slug(self.title, uuid=self.uuid)

        # First publish of a non-sketch: stamp the time once.
        if not self.published_at and not self.sketch:
            self.published_at = timezone.now()

        if not self.sketch or kwargs.pop('update_language', False):
            if self.body and len(self.body) > self.LANG_MIN_LEN:
                cleaned = generate.clean(self.body)
                result = cld3.get_language(cleaned)
                logger.debug(
                    f'NarrativeTranslation language classification results: {result}'
                )
                if result.is_reliable:
                    # Keep only the base language tag, capped at 5 chars.
                    language = result.language.split('-')[0][:5]
                    self.language = language
                else:
                    # f-prefix dropped: no placeholders in these messages.
                    logger.debug(
                        'NarrativeTranslation failed language classification.'
                    )

            else:
                logger.debug(
                    'NarrativeTranslation skipped language classification.')

        super().save(*args, **kwargs)
Esempio n. 25
0
    def test_get_language(self):
        """Smoke-test cld3 detection across several scripts."""
        # Empty or missing input yields no prediction at all.
        self.assertIsNone(cld3.get_language(""))
        self.assertIsNone(cld3.get_language(None))

        cases = [
            ("影響包含對氣候的變化以及自然資源的枯竭程度", "zh"),  # noqa
            ("This is a test", "en"),
            ("وفي وقت سابق اليوم السبت قالت الرئاسة المصرية -في بيان- إنها تتطلع لقيام الولايات المتحدة بدور فعال، خاصة في ضوء وصول المفاوضات بين الدول الثلاث لطريق مسدود.", "ar"),  # noqa
            ("مغلوں کی خام اور سفید و سیاہ میں تصویر کشی دراصل مودی کی دائیں بازو والی بی جے پی حکومت کے اقتدار میں بھارتی مسلمانوں سے روا رکھے جانے سلوک کو درست ٹھہرانے کی کوشش کے سوا کچھ نہیں۔ ", "ur"),  # noqa
        ]
        for text, expected in cases:
            self.assertEqual(cld3.get_language(text).language, expected)
Esempio n. 26
0
def group_tiplines_by_language(tip_line_requests, languages=None):
    """Filter tip-line requests into clean, language-tagged samples.

    A tip survives when its emoji-stripped text is usable ('NA' and
    whitespace excluded), 60-1200 characters long, free of URLs and
    phone numbers, and confidently (probability >= 0.95) detected as one
    of *languages*.

    Args:
        tip_line_requests: list of tip dicts (mutated in place).
        languages: language codes to keep; defaults to the original set.

    Returns:
        list of {'text', 'language', 'source'} dicts.
    """
    # Default moved out of the signature: mutable default arguments are
    # shared across calls.
    if languages is None:
        languages = ['en', 'pt', 'hi', 'mr', 'bn', 'ta', 'te', 'ml']
    for tip in tip_line_requests:
        tip['text'] = remove_emoji(
            tip['media_text'] if tip['media_text'] != 'NA'
            and len(tip['media_text']) >= len(tip['media_title']) else
            tip['media_title'])
        lang_data = cld3.get_language(tip['text'])
        # Require a confident detection before tagging the tip.
        if lang_data is not None and lang_data.probability >= 0.95:
            tip['language'] = lang_data.language
    tip_line_requests = [
        tip for tip in tip_line_requests if tip['text'] != 'NA'
        and not tip['text'].isspace() and 'language' in tip and (
            60 <= len(tip['text']) <= 1200) and not contains_url(tip['text'])
        and not contains_phone_number(tip['text'])
    ]

    return [{
        'text': item['text'],
        'language': item['language'],
        'source': SourceName.TIPLINE.value
    } for item in tip_line_requests if item['language'] in languages]
Esempio n. 27
0
def get_lid(line, threshold=150):
    """Return (cld2_code, cld3_code) language predictions for *line*.

    Lines of *threshold* characters or fewer are not classified and get
    ("ukn", "ukn"); any detector or mapping failure also yields "ukn".
    """
    # Short lines are too noisy to classify.
    if len(line) <= threshold:
        return "ukn", "ukn"

    # CLD2
    try:
        isReliable, textBytesFound, details = cld2.detect(line,
                                                          isPlainText=True)
        code_cld2 = mapping_dict[details[0][1]]
    except Exception:
        # Narrowed from a bare `except:`; detection or mapping failed.
        code_cld2 = "ukn"

    # CLD3
    try:
        prediction = cld3.get_language(line)
        code_cld3 = mapping_dict[prediction[0]]
    except Exception:
        code_cld3 = "ukn"

    return code_cld2, code_cld3
Esempio n. 28
0
 def detect(self, query):
     """Return the raw cld3 prediction object for *query*."""
     return cld3.get_language(query)
Esempio n. 29
0
import csv
import time

import cld3

# Benchmark cld3 accuracy against a labelled CSV: column 0 holds the
# expected language code, column 1 the sample text.
with open('lang_detect_test.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter=',')

    correct = 0
    total = 0
    start_time = time.time()

    for row in reader:
        total += 1
        expected = row[0]
        detected = cld3.get_language(row[1])

        if expected == detected.language:
            correct += 1
        else:
            # Report each mismatch as it is found.
            print(
                f'{row[1]}. expected = {expected}, result = {detected.language}.')

    accuracy = (correct / total) * 100
    print(f'accuracy = {accuracy}% . elapsed={time.time() - start_time}')
Esempio n. 30
0
]

#for i in kcu:
#cnt+=1
# Output CSV accumulating the filtered English core-bot tweets.
fsn = "CoreBotTweetsCombinedEN.csv"
#dff = pd.DataFrame([["The", " Core Bot UserID", " is:", " "+str(i)]], columns=["tweetid", "tweet_text", "hashtags", "urls"])
#dff[["tweet_text"]] = dff["tweet_text"].apply(translator.translate, dest='en').apply(getattr, args=('text',))
#dff[["tweetid", "tweet_text", "hashtags", "urls"]].to_csv(fsn, mode='a', header=i, index=False)
for df_ in dfn:
    #translators = Translator(to_lang='en', from_lang='ru')
    # NOTE(review): this slice is computed and discarded — it does not
    # clear df_lst in place; confirm whether a reset was intended here.
    df_lst.iloc[0:0]
    #gs = goslate.Goslate()
    t0 = time.time()

    # Keep rows that have a userid, then flag tweets that contain the
    # current search term AND are detected as English by cld3.
    df_lst = df_.loc[df_["userid"].map(lambda x: x is not None)]
    for z in eng:
        df_lst["Yes"] = df_["tweet_text"].apply(
            lambda x: "true" if str(z).lower() in str(x).lower(
            ) and cld3.get_language(str(x)).language == 'en' else "false")
    df_lst = df_lst.loc[df_lst["Yes"].map(lambda x: x == "true")]
    #pdb.set_trace()
    #print(df_.tweet_text)
    #print(df_["tweet_text"].apply(lambda x: translators.translate(x)))
    #df_lst["tweet_text"] = df_lst.tweet_text.apply(lambda x: gs.translate(x, 'en')
    # Tag the surviving rows as English and append them to the CSV,
    # then report this frame's elapsed processing time.
    df_lst.insert(14, "language", "en")
    df_lst = df_lst.dropna(subset=["tweet_text"])
    df_lst[["tweetid", "userid", "tweet_text", "hashtags", "urls",
            "language"]].to_csv(fsn, mode='a', header=False, index=False)
    t1 = time.time()
    print(t1 - t0)