Beispiel #1
0
    def extract_webpage(self, url: str, options: dict = None) -> ExtractResponse:
        """
        Extract text and metadata from a URL pointing to an HTML page.

        The page is downloaded and parsed with ``newspaper.Article`` and the
        language is detected from the extracted text. When ``detect_language``
        returns a falsy value the language falls back to "en".

        Args:
            url: Address of the web page to extract.
            options: Currently unused; reserved for future settings such as
                auth/login. Defaults to ``None`` instead of a shared mutable
                dict so that state can never leak between calls.

        Returns:
            An ``ExtractResponse`` carrying the source URL, the detected
            language, the extracted text, the document class "article", the
            media type "text/html" and an empty metadata dict.
        """
        # TODO support auth/login through options (e.g. via a requests.Session)

        # https://newspaper.readthedocs.io/en/latest/
        article = newspaper.Article(url=url)
        article.download()
        article.parse()
        text = article.text

        return ExtractResponse(
            sourceUrl=url,
            language=detect_language(text) or "en",
            text=text,
            document_class="article",
            mediatype="text/html",
            metadata={},
        )
    def _getSpacyModelName(self):
        """
        Resolve the spaCy model name for this instance.

        Fills in ``self.language`` (when unset or "detect") from the input
        text and ``self.model`` (when unset or "default") with a small default
        model, then returns ``"<language>_<model>"``.
        """
        needs_detection = not self.language or self.language.lower() == "detect"
        if needs_detection:
            # Detect the most probable language code from the input text and
            # keep only its first two characters, lower-cased.
            detected = detect_language(self.text)
            self.language = detected[:2].lower()

        needs_default_model = not self.model or self.model.lower() == "default"
        if needs_default_model:
            # English uses the "web" model family; every other language falls
            # back to the "news" family.
            self.model = "core_web_sm" if self.language == "en" else "core_news_sm"

        # At this point we should have a valid, two-char language code, plus a
        # 'model' shortname. For default spaCy languages the two are simply
        # joined, e.g. "de_core_news_sm".
        # TODO for our specialized languages, we may do something differently
        # here (e.g. resolve to a path)
        return f"{self.language}_{self.model}"
Beispiel #3
0
    def __init__(self, subtitles_file, subtitles_language: str,
                 audio_language: str):
        """
        Constructor for the SubtitlesParser class.

        Reads and decodes the subtitles, parses them via ``read_subtitles``,
        resolves the subtitles language (auto-detecting it when requested)
        and creates the translator from subtitles language to audio language.

        Params:
            subtitles_file (FileStorage): File with the subtitles loaded.
            subtitles_language (str): The language of the subtitles, or 'ad'
                to auto-detect it from the subtitles text.
            audio_language (str): The language to get the hot words in (the audio language).

        Raises:
            ValueError: If the language is auto-detected and is not listed in
                ``Constants.GOOGLE_LANGUAGES``.
        """

        subtitles_binary = subtitles_file.read()
        # The detector may report no encoding at all (presumably chardet-style
        # output, e.g. for empty input - confirm against detect_encoding);
        # fall back to UTF-8 instead of crashing in decode().
        encoding = detect_encoding(subtitles_binary)['encoding'] or 'utf-8'
        self.subtitles = subtitles_binary.decode(encoding)
        self.encoding = encoding
        logger.debug(f'Subtitles Encoding: {encoding}')
        # Wrapped in a list so the log shows the repr (escapes made visible).
        logger.debug(f'Subtitles[:1000]: {[self.subtitles[:1000]]}')
        self.read_subtitles()
        self.audio_language = audio_language

        # Detect subtitles language
        if subtitles_language == 'ad':  # ad = Auto Detect
            detected_language = detect_language(self.subtitles)
            logger.debug(f"Subtitles language detected as {detected_language}")

            # True if the detected language is in google's supported languages
            is_supported = any(
                lang['language'] == detected_language
                for lang in Constants.GOOGLE_LANGUAGES)
            if not is_supported:
                raise ValueError(
                    f'Detected language {detected_language} is not supported.')
            self.subtitles_language = detected_language
        else:
            self.subtitles_language = subtitles_language

        self.translator = CustomTranslator(self.subtitles_language,
                                           audio_language)
Beispiel #4
0
def main():
    """Detect the language of every line in a comments file and dump the result as JSON.

    Command line: ``<comments_file> [output_file]``. When exactly one extra
    argument is given it is used as the output file name; otherwise the name
    is the input file's base name with a ".json" extension.

    The result maps each detected language to a list of
    ``(comment, accuracy, order)`` entries, where ``order`` is the position of
    that language in the detector's result for the comment. The JSON is
    written to ``../outLangDetect/<output_file>`` and echoed to stdout.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("usage: %s <comments_file> [output_file]" % sys.argv[0])

    input_path = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) == 3 else (
        os.path.splitext(os.path.basename(input_path))[0] + ".json")

    profiles = ld.create_languages_profiles()
    all_languages = defaultdict(list)

    with open(input_path, 'r') as comments_file:
        for comment in comments_file:
            languages = ld.detect_language(comment, profiles)
            for order, (language, accuracy) in enumerate(languages):
                all_languages[language].append((comment, accuracy, order))

    json_result = json.dumps(all_languages)

    output_dir = '../outLangDetect'
    # Create the directory without a check-then-create race; only re-raise
    # when the path still is not a directory afterwards.
    try:
        os.makedirs(output_dir)
    except OSError:
        if not os.path.isdir(output_dir):
            raise

    with open(output_dir + '/' + output_file, 'w') as out_file:
        out_file.write(json_result)
    sys.stdout.write(json_result)
Beispiel #5
0
def test_data(profiles, dir_path):
    """Count correct and incorrect language detections over a test corpus.

    For every language in ``TESTING_LANGUAGES`` the files in
    ``dir_path + language + "/"`` are read and passed to
    ``ld.detect_language``; the first element of the first result is compared
    with the directory's language.

    Args:
        profiles: Language profiles forwarded to ``ld.detect_language``.
        dir_path: Directory prefix of the corpus. It is concatenated directly
            with the language name, so it must end with a path separator.

    Returns:
        A ``(right, wrong)`` tuple of counts. Files for which detection
        raises ``ZeroDivisionError`` are reported and counted in neither.
    """
    right = 0
    wrong = 0
    for language in TESTING_LANGUAGES:
        path = dir_path + language + "/"
        for filename in os.listdir(path):
            # The context manager closes the file; no explicit close needed.
            with open(path + filename, 'r') as f:
                text = f.read()

            try:
                results = ld.detect_language(text, profiles)
            except ZeroDivisionError:
                # Single-argument call form prints identically on Python 2 and 3.
                print("ERROR: " + path + filename)
                continue

            if results[0][0] == language:
                right += 1
            else:
                wrong += 1

    return right, wrong
Beispiel #6
0
def test_data(profiles, dir_path):
    """Run language detection over a test corpus and print an accuracy report.

    For every language in ``TESTING_LANGUAGES`` the files in
    ``dir_path + language + "/"`` are read and passed to
    ``ld.detect_language``; the first element of the first result is compared
    with the directory's language. Per-language counts and accuracy are
    printed, followed by the overall totals.

    Args:
        profiles: Language profiles forwarded to ``ld.detect_language``.
        dir_path: Directory prefix of the corpus. It is concatenated directly
            with the language name, so it must end with a path separator.

    Files for which detection raises ``ZeroDivisionError`` are reported and
    excluded from the counts.
    """

    def percentage(hits, total):
        # With no scored files (empty directory, or every file errored) there
        # is nothing to divide by; report 0.0 instead of crashing.
        if total == 0:
            return 0.0
        return hits / float(total) * 100

    right = 0
    wrong = 0
    # Every print uses a single pre-formatted string so the output is the
    # same under the Python 2 print statement and the Python 3 function.
    print("Test in %s" % dir_path)
    print("Language, total, precision")
    for language in TESTING_LANGUAGES:
        print("Language: %s" % language)

        lang_right = 0
        lang_wrong = 0
        path = dir_path + language + "/"
        for filename in os.listdir(path):
            # The context manager closes the file; no explicit close needed.
            with open(path + filename, 'r') as f:
                text = f.read()

            try:
                results = ld.detect_language(text, profiles)
            except ZeroDivisionError:
                print("ERROR: %s" % (path + filename))
                continue

            if results[0][0] == language:
                lang_right += 1
            else:
                lang_wrong += 1

        right += lang_right
        wrong += lang_wrong

        lang_total = lang_right + lang_wrong
        print("Right: %s, Wrong: %s Total: %s" % (
            str(lang_right), str(lang_wrong), str(lang_total)))
        print("Accuracy: %s " % str(percentage(lang_right, lang_total)))

    print("======TOTAL:")
    print("\tRight: %s, Wrong: %s Total: %s" % (str(right), str(wrong),
                                                str(right + wrong)))
    print("\tPrecision: %s \n" % str(percentage(right, right + wrong)))