Example No. 1
def detectLanguage():
    """!
    Detects the language of each comment in a newly created commentsFile and flags
    comments that look Croatian (or a closely related language such as Slovenian).
    """
    print "Analyzing " + str(len(cFiles) + len(pyFiles)) + " files. This may take a while."

    failed = []

    with codecs.open(commentsFile, "r", encoding='utf8') as f:
        for line in f:
            # skip very short lines and author tags
            if len(line) < 10 or "author" in line:
                continue
            try:
                if detect(line) in ('hr', 'sl'):
                    failed.append(line)
            except Exception:
                pass

    print "Found " + str(len(failed)) + " suspicious comments: "
    for fail in failed:
        print fail

    print "Finding suspicious comments...done."

    # erase contents of the comments file (alternatively the file could be deleted)
    open(commentsFile, 'w').close()
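The comment above leaves open whether the comments file should merely be emptied or removed altogether; a minimal sketch of the deletion alternative, assuming commentsFile holds the same path used above:

import os

# Hypothetical alternative: delete the file instead of truncating it.
if os.path.exists(commentsFile):
    os.remove(commentsFile)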
def get_youtube_comments(link):
    o = urlparse(link)
    query = o.query.split('&')
    videoID = query[0].replace('v=', '')
    youtube = get_authenticated_service(videoID)

    # All the available methods are used in sequence just for the sake of an example.
    text = ''
    video_comment_threads = get_comment_threads(youtube, videoID)
    for thread in video_comment_threads:
        topComment = thread["snippet"]["topLevelComment"]

        cmt = topComment["snippet"]["textDisplay"] + '\n'
        if detect(cmt) == 'en':
            text += cmt

        parent_id = thread["id"]
        video_comments = get_comments(youtube, parent_id)

        for child_comments in video_comments:
            cmt = child_comments["snippet"]["textDisplay"] + '\n'
            try:
                if detect(cmt) == 'en':
                    text += cmt
            except:
                text += ''
    return text
def get_lang(article_str):
    lang = "not_detected"
    try:
        lang = detect(article_str)
    except UnicodeDecodeError:
        lang = detect(article_str.decode("UTF-8"))
    except:
        "Not Detected = " + article_str
    return lang
Example No. 4
 def on_success(self, data):
     if ('text' in data) and (detect(data['text'])=='en'):
         rmLinkRegex = re.sub(r"(?:\@|https?\://)\S+", '', data['text'], flags=re.MULTILINE)
         rmNonAscii = re.sub(r'[^\x00-\x7F]+',' ', rmLinkRegex)
         print(rmNonAscii)
         print('\n')
         print('*****',detect(data['text']),"\n")
         getSpeech(rmNonAscii)
         time.sleep(1)
def detectEmailLanguage(m):
    b = m['body'].decode("utf-8", "ignore")
    s = m['subject']
    try:
        return detect(b)
    except:
        try:
            return detect(s)
        except:
            return "en"
    def _language(self, item):
        """Returns the language of the extracted article by analyzing metatags and inspecting the visible text
        with langdetect"""

        response = item['spider_response'].body
        root = html.fromstring(response)

        # Check for lang-attributes
        lang = root.get('lang')

        if lang is None:
            lang = root.get('xml:lang')

        # Check for general meta tags
        if lang is None:
            meta = root.cssselect('meta[name="language"]')
            if len(meta) > 0:
                lang = meta[0].get('content')

        # Check for open graph tags
        if lang is None:
            meta = root.cssselect('meta[property="og:locale"]')
            if len(meta) > 0:
                lang = meta[0].get('content')

        # Look for <article> elements and inspect the one with the largest payload with langdetect
        if lang is None:
            article_list = []
            for article in root.xpath('//article'):
                article_list.append(re.sub(r'\s+', ' ', article.text_content().strip()))
            if len(article_list) > 0:
                lang = detect(max(article_list, key=len))

        # Analyze the whole body with langdetect
        if lang is None:
            try:
                lang = detect(root.text_content().strip())
            except LangDetectException:
                pass

        # Try to normalize output
        if lang is not None:
            # First search for suitable locale in the original output
            matches = self.langcode_pattern.search(lang)
            if matches is not None:
                lang = matches.group(0)
            else:
                # If no match was found, normalize the original output and search again
                normalized = locale.normalize(re.split(r'\s|;|,', lang.strip())[0])
                matches = self.langcode_pattern.search(normalized)
                if matches is not None:
                    lang = matches.group(0)

        return lang
Example No. 7
    def check(wb, tb):
        if len(wb[0]) <= 1 or len(wb[1]) <= 2:
            return False
        try:
            if detect(wb[0]) != "ar" or detect(wb[1]) != "ar":
                return False
        except:
            return False

        if tb in [("NN", "NN"), ("NN", "DTNN"), ("NNP", "NNP")]:
            return True
        return False
Example No. 8
def get_words(text):
    text = text.decode('utf-8')
    if not text:
        return None
    lang = detect(text)
    if lang != u'en':
        print lang
        return None

    # strip punctuation characters before tokenizing
    for ch in u',./\\“”!@#$%^&*()-\'"+=`~:;?><':
        text = text.replace(ch, ' ')
    words = [porter.stem(word) for word in word_tokenize(text)
             if word.isalpha()
             and len(word) >= 3
             and word not in stopwords.words('english')]
    return words
Example No. 9
def detect_langs(corpus):
    global langs
    import langdetect

    langs = []
    for doc in corpus.view_contexts(corpus.context_types[-1], as_strings=True):
        langs.append(langdetect.detect(' '.join(doc)))
    return langs
def findPosts(user):
    posts = []
    # TWITTER
    if('twitterId' in user):
        CONSUMER_KEY = 'p05WZVs4JivX4a0WSwFyMXXCo'
        CONSUMER_SECRET = 'DghsY9Dxn2X8xAjdQKEvwBLtqsHNJabFz361pz2ZvRmAgXiPHB'
        ACCESS_KEY = '167813147-LwEOQAqO6RCnK0GfIEXNeVOng93QHkW1iFuVjBUV'
        ACCESS_SECRET = 'kpzp3quxTmVpSfWdgcyN5qbrPTmyoFArdvJeUC4Dfjtg1'

        twitter = Twython(CONSUMER_KEY,CONSUMER_SECRET,ACCESS_KEY,ACCESS_SECRET)
        user_timeline = twitter.get_user_timeline(screen_name=user['twitterId'], count=100, include_retweets=False)
    

        for tweet in user_timeline:
            tweet_utf = removeAccents(unicode(tweet['text'].encode('utf-8')))
            if(detect(tweet_utf) == 'pt'):
                tweet_id = unicode(str(tweet['id']))
                if(tweet_id and tweet_utf):
                    posts.append((tweet_id, tweet_utf, u'Post', u'Twitter'))            
    # FACEBOOK                
    if('facebookId' in user):
        app_id = "704203256284579"
        app_secret = "9a75ef350e4f9b24d8be454abf29ae68"
        access_token = facebook.GraphAPI().get_app_access_token(app_id, app_secret)
        graph = facebook.GraphAPI(access_token)
        profile = graph.get_object(user['facebookId'])
        f_posts = graph.get_connections(profile['id'], 'posts')

        for f_post in f_posts['data']:
            if 'message' in f_post and len(f_post['message']) > 10:
                posts.append((f_post['id'], f_post['message'], u'Post', u'Facebook'))
            if 'description' in f_post and len(f_post['description']) > 10:
                posts.append((f_post['id'], f_post['description'], u'Post', u'Facebook'))

    return posts
Example No. 11
def get_description_language(content):
    """
    Parameters
    -------------
    content: bs4.element.tag
        element that contains the description data.
    Returns
    -------------
    str: document description and document language.
    """
    # There might be other ways they store descriptions, might need to add symbols
    possibilities = ['blockquote', 'p']
    description = None
    for tag in possibilities:
        description = content.find(tag)
        if description is not None:
            break
    if description is None:
        return 'No description', 'None'
    description = description.getText()
    if description == '':
        return 'No description', 'None'
    else:
        try:
            return description, detect(description)
        except LangDetectException:
            return 'No description', 'None'
Example No. 12
    def analyze_font(self, fontid, samples):
        sampletext = ""

        # very involved way of getting a representative sample, since
        # an encoded font can be partially unencoded...
        for textbox in samples:
            decode_all = not('i' in [getattr(x, 'tag', None) for x in textbox])
            if decode_all:
                sampletext += etree.tostring(textbox, method="text",
                                             encoding="utf-8").decode("utf-8")
            else:
                for subpart in textbox:
                    if (isinstance(subpart, etree._Element) and
                        (decode_all or subpart.tag == 'i')):
                        if subpart.text: # it might be None, for eg "<i><b>text is in child instead</b></i>"
                            sampletext += subpart.text

        for low_offset, high_offset, unmapped in ((0,0, []),
                                                  (0x1d, 0x7a, []),
                                                  (0x20, 0x40, [0x20])):
            if low_offset and high_offset:
                encodingmap = self.encodingmap(low_offset, high_offset, unmapped)
                decoded_sample = self.decode_string(sampletext, encodingmap)
            else:
                encodingmap = None
                decoded_sample = sampletext
            try:
                lang = detect(decoded_sample)
                if lang == 'sv':
                    self.encodingmaps[int(fontid)] = encodingmap
                    return low_offset # used for diagnostic logging
            except LangDetectException:
                pass
        raise errors.PDFDecodeError("cannot detect how to decode font %s using %r" %
                                    (fontid, sampletext))
def build_dictionary(review_page):
    #dictionary for each product
    product = {}
    #counter used to detect review for particular product
    counter = 0
    #language of the reviews, detected from the first new product (None until detected)
    language = None
    #parsing the review page
    dom = parse(StringIO.StringIO(review_page))
    
    #extracting the item tags
    name = dom.getElementsByTagName('item')
    #iterating over each item
    for child in name:
        #extracting the given title tag from the item
        title = child.getElementsByTagName('title')[0]
        #extracting the text title from the item
        text = child.getElementsByTagName('text')[0]

        #checking if the corresponding tag is present in the item
        if(title.hasChildNodes() and len(title.childNodes) > 0 and title.childNodes[0].data is not None
            and text.hasChildNodes() and len(text.childNodes) > 0 and text.childNodes[0].data is not None):
            key = title.childNodes[0].data
            value = text.childNodes[0].data

            #checking if the product is present in the dictionary
            if key in product:
                product[key].append(value)
            else:
                #if the product is not present, then adding that in the dictionary, based on the language
                if counter == 0:
                    #langdect library used to detect the language of the particular review
                    language = langdetect.detect(value)
                    counter += 1
                product[key] = [value]
    dictionary = (language, product)
    return dictionary
Example No. 14
def valid_language(text):
    supported_languages = settings.LANGUAGE_DETECTION
    if supported_languages:
        lang = langdetect.detect(text)
        if lang not in supported_languages:
            raise ValidationError(
                    'Language "{0}" is not one of the supported languages {1}!'.format(lang, supported_languages))
Example No. 15
def titles(self, key, value):
    def is_main_title(key):
        return key.startswith('245')

    def is_translated_title(key):
        return key.startswith('242')

    titles = self.setdefault('titles', [])
    values = force_force_list(value)
    for val in values:
        title_obj = {
            'title': val.get('a'),
            'subtitle': force_single_element(val.get('b')),  # FIXME: #1484
            'source': val.get('9'),
        }
        if is_main_title(key):
            titles.insert(0, title_obj)
        elif is_translated_title(key):
            title = val.get('a')
            if title:
                lang = langdetect.detect(title)
                if lang:
                    title_obj['language'] = lang
                    self.setdefault('title_translations', []).append(title_obj)
        else:
            titles.append(title_obj)

    return titles
Example No. 16
 def language_in_tweet(tweet):
     detected_lang = None
     try: 
         detected_lang = detect(tweet['text'])             
     except lang_detect_exception.LangDetectException:
         pass
     return detected_lang in args
Example No. 17
def analyze(s, language=None):
    # Detect language if not provided
    if language is None:
        language = detect(s)
    if language not in ["en"]:
        raise ValueError("Language "+language+" not supported")

    # Load pattern
    pattern = importlib.import_module("pattern." + language)
    # Perform analysis
    analysis = {}
    pt = pattern.parsetree(s)
    analysis["wordPerSentence"] = stats([len(s.words) for s in pt])
    #Moods
    moods = Counter([pattern.mood(s) for s in pt])
    tot=sum([v for k,v in moods.iteritems()])
    analysis["moods"] = {}
    for k in moods.keys():
       analysis["moods"][k] = round(float(moods[k])/tot*100)
    #
    analysis["modality"] = stats([pattern.modality(s) for s in pt])
    sentiments = [pattern.sentiment(s) for s in pt]
    analysis["polarity"] = stats([s[0] for s in sentiments])
    analysis["subjectivity"] = stats([s[1] for s in sentiments])
    analysis["positivity"] = stats([int(pattern.positive(s)) for s in pt])
    return analysis
Example No. 18
def detectLanguage(sentence):
	try:
		lang = detect(sentence)
		return lang 
	except Exception,e:
		print "--- ERROR detecting language ---"
		print e 
Example No. 19
 def search_by_brand(self, brand):
     '''Search by brand; returns the mall ids together with the distribution of results.'''
     self.collect_brands()
     if detect(brand) == "ja":
         if brand in self.brands["jap"].keys():
             return self.search_mall_id(self.brands["jap"][brand]["url"])
         else:
             brand_t = re.split("・|ー| |&", brand)
             for k,v in self.brands["jap"].items():
                 for t in v["tags"]:
                     if t == brand:
                         return(self.search_mall_id(self.brands["jap"][k]["url"]))
                     for tag in brand_t:
                         if tag == t:
                             return(self.search_mall_id(self.brands["jap"][k]["url"]))
     else:
         if brand.lower() in self.brands["en"].keys():
             
             return(self.search_mall_id(self.brands["en"][brand.lower()]["url"]))
         else:
             brand_t = re.split("・|ー| |&", brand)
             for k,v in self.brands["en"].items():
                 for t in v["tags"]:
                     if t == brand:
                         return(self.search_mall_id(self.brands["en"][k]["url"]))
                     for tag in brand_t:
                         if tag == t:
                             return(self.search_mall_id(self.brands["en"][k]["url"]))
Example No. 20
def _get_art_context(record):
    reader = LiteratureReader(record)

    abstract = reader.abstract
    try:
        abstract_language = detect(abstract)
    except LangDetectException:
        abstract_language = ''

    return {
        'abstract': abstract,
        'abstract_language': abstract_language,
        'arxiv_id': reader.arxiv_id,
        'authors': get_authors(record),
        'collaborations': reader.collaborations,
        'divulgation': get_divulgation(record),
        'doi': get_doi(record),
        'domains': get_domains(record),
        'inspire_id': get_inspire_id(record),
        'journal_issue': get_journal_issue(record),
        'journal_title': get_journal_title(record),
        'journal_volume': get_journal_volume(record),
        'keywords': reader.keywords,
        'language': get_language(record),
        'page_artid': get_page_artid(record),
        'peer_reviewed': get_peer_reviewed(record),
        'publication_date': get_publication_date(record),
        'subtitle': reader.subtitle,
        'title': reader.title,
    }
Example No. 21
    def _get_ocr(self, pngs):

        self._render("  OCRing the PDF", 2)

        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)

        guessed_language = langdetect.detect(raw_text)

        self._render("    Language detected: {}".format(guessed_language), 2)

        if guessed_language not in ISO639:
            self._render("Language detection failed!", 0)
            if settings.FORGIVING_OCR:
                self._render("As FORGIVING_OCR is enabled, we're going to make the best " "with what we have.", 1)
                return raw_text
            raise OCRError

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            return raw_text

        try:
            return self._ocr(pngs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
                self._render(
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(guessed_language),
                    0,
                )
                return raw_text
            raise OCRError
Example No. 22
def read_data():
    """
    INPUT: None
    OUTPUT: pandas data frame from file
    """

    list_of_files = glob.glob('app/uploads/*.csv')  # * means all; if a specific format is needed, use *.csv
    latest_file = max(list_of_files, key=os.path.getctime)

    df = pd.read_csv(latest_file, skiprows=12, usecols=range(0, 12))

    # the export contains a single platform, so compare against the whole column
    if (df.Platform == 'iOS').all():

        keep = ['Date', 'App ID', 'App Name', 'User', 'Version', 'Rating', 'Review']
        df = df[keep]
        df.columns = ['date', 'business_id', 'business_name', 'user_name', 'version', 'review_stars', 'text']

    else:

        df = df[df.Language == 'English']
        keep = ['Date', 'App Name', 'Publisher ID', 'User', 'Rating', 'Review']
        df = df[keep]
        df.columns = ['date', 'business_name', 'business_id', 'user_name', 'review_stars', 'text']

    # detect the language of each review and keep only the English ones
    langs = []
    for rev in df['text']:
        try:
            langs.append(detect(rev))
        except Exception:
            langs.append(None)
    df['lang'] = langs
    df = df[df.lang == 'en']

    return df
 def is_english(s):
     """Predicate that estimates whether a given string is in English"""
     try: 
         return langdetect.detect(s) == 'en'
     except:
         print("Couldn't detect the language of: {}".format(s))
         return True
Example No. 24
	def get_lang(self):
		"""
		Detect language from the body. This method takes some time
		@return string lang     can be 'fr' or 'en'
		"""
		lang   = detect(self.get_body())
		return lang
Example No. 25
 def _guess_language(self, text):
     try:
         guess = langdetect.detect(text)
         self.log("debug", "Language detected: {}".format(guess))
         return guess
     except Exception as e:
         self.log("warning", "Language detection error: {}".format(e))
Example No. 26
def translate_to_en(input_text):
    source = detect(input_text)
    translated = service.translations().list(q = input_text,
                                             target = 'en',
                                             source = source).execute()

    return (source, translated['translations'][0]['translatedText'])
def czech_filter(text):
    '''Filter Czech text using the langdetect library.
    To filter another language, simply change 'cs' to another value, e.g. 'de', 'sk', 'pl'.
    '''
    if langdetect.detect(text) == 'cs':
        return True
    return False
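The docstring points out that other languages can be filtered by swapping the 'cs' code; a small generalized sketch along those lines (language_filter is an illustrative name, not part of the original code):

import langdetect
from langdetect.lang_detect_exception import LangDetectException

def language_filter(text, lang_code='cs'):
    """Return True when langdetect identifies text as lang_code, e.g. 'de', 'sk' or 'pl'."""
    try:
        return langdetect.detect(text) == lang_code
    except LangDetectException:
        return False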
Example No. 28
def filter_russian(content_file_path, output_file_path):
    from langdetect import detect, lang_detect_exception
    line_count = 0
    line_count_mod = 1
    lang = None

    print 'Content file:', content_file_path
    print 'Filtering Russian language...'
    out = codecs.open(output_file_path, 'w+', encoding=ENCODING)
    with codecs.open(content_file_path, encoding=ENCODING) as content_file:
        for line in content_file:
            try:
                lang = detect(line)
            except lang_detect_exception.LangDetectException:
                lang = None

            if lang == 'ru':
                out.write(line)

            line_count += 1
            if line_count % line_count_mod == 0:
                print line_count, 'lines processed...'
                line_count_mod *= 2
    
    print line_count, 'lines processed in total.'
    out.close()
Example No. 29
def produce_raw_layer():
    try:
        print 'START: Insert of data into database at %s.' % datetime.datetime.now()
        cycle_start_time = datetime.datetime.now()
        recent_media_added = 0
        users_added = 0
        users_updated = 0
        user_recent_media_added = 0

        # Get recent popular media
        recent_media = api.media_popular(count=64)
        for media in recent_media:
            # Parse the recent popular media
            parsed_media = RawRecentMediaEntity.parse(media)

            # Determine if english speaking user, if so, continue
            ## TODO: Maybe detect all possible languages and then if 'en' is in it, it passes
            try:
                if langdetect.detect(parsed_media.caption_text) != 'en':
                    continue
            except LangDetectException:
                continue

            # Save the parsed media
            parsed_media.save()
            recent_media_added += 1

            user_recent_media_added, users_added, users_updated = handle_user_info(parsed_media,
                                                                                  user_recent_media_added,
                                                                                  users_added, users_updated)

        log_run_metrics(cycle_start_time, recent_media_added, users_added, users_updated, user_recent_media_added)
    except Exception as e:
        print("ERROR - userId: %d caused error: " + str(e))
        pass
Example No. 30
    def fetch_item_lyrics(self, lib, item, write, force):
        """Fetch and store lyrics for a single item. If ``write``, then the
        lyrics will also be written to the file itself."""
        # Skip if the item already has lyrics.
        if not force and item.lyrics:
            self._log.info(u"lyrics already present: {0}", item)
            return

        lyrics = None
        for artist, titles in search_pairs(item):
            lyrics = [self.get_lyrics(artist, title) for title in titles]
            if any(lyrics):
                break

        lyrics = u"\n\n---\n\n".join([l for l in lyrics if l])

        if lyrics:
            self._log.info(u"fetched lyrics: {0}", item)
            if HAS_LANGDETECT and self.config["bing_client_secret"].get():
                lang_from = langdetect.detect(lyrics)
                if self.config["bing_lang_to"].get() != lang_from and (
                    not self.config["bing_lang_from"] or (lang_from in self.config["bing_lang_from"].as_str_seq())
                ):
                    lyrics = self.append_translation(lyrics, self.config["bing_lang_to"])
        else:
            self._log.info(u"lyrics not found: {0}", item)
            fallback = self.config["fallback"].get()
            if fallback:
                lyrics = fallback
            else:
                return
        item.lyrics = lyrics
        if write:
            item.try_write()
        item.store()
Example No. 31
    def __process_tweet(self,
                        tweet,
                        group_name,
                        flush_output=True,
                        verbosity=True):
        # Fetch the full text of the tweet
        if 'extended_tweet' in tweet and tweet['extended_tweet']:
            if flush_output and verbosity:
                print("\t\t Extended tweet\n")
            text = tweet['extended_tweet']['full_text']
        else:
            text = tweet['text']

        text = text.encode('utf-16', 'surrogatepass').decode('utf-16')

        # Fetch the tweet source
        pattern = re.compile("(\>)(.+)(\<)")
        source = pattern.search(tweet['source']).group(2)
        if flush_output and verbosity:
            print("\t\tTweeted using {}\n".format(source))

        if tweet['lang']:
            lang = tweet['lang']
            print("\t\tTweet language is {}\n".format(lang))
        else:
            lang = detect(text)
            if flush_output and verbosity:
                print("\t\tLanguage detected as {}\n".format(lang))

        document = {
            '_id': tweet['id_str'],
            'text': text,
            'lang': lang,
            'source': source,
            'category': group_name,
            'quotes': tweet['quote_count'],
            'replies': tweet['reply_count'],
            'faves': tweet['favorite_count'],
            'retweets': tweet['retweet_count'],
            'created_at': tweet['created_at'],
            'quoted_tweet': tweet['quoted_status_id_str'] if 'quoted_status_id_str' in tweet else None,
            'user': {
                '_id': tweet['user']['id_str'],
                'name': tweet['user']['name'],
                'username': tweet['user']['screen_name'],
                'location': tweet['user']['location'],
                'verified': tweet['user']['verified'],
                'followers': tweet['user']['followers_count'],
                'followings': tweet['user']['friends_count'],
                'favourites': tweet['user']['favourites_count'],
                'statuses': tweet['user']['statuses_count']
            }
        }

        if flush_output:
            print("\t\tSaving the tweet... ")

        status, mode, record_id = self.dbi.upsert('tweets',
                                                  {'_id': tweet['id_str']},
                                                  document)

        altered = False

        if status:
            if flush_output:
                print("Done")

            if mode == self.dbi.MODE_INSERTED:
                altered = True
                if flush_output:
                    print(" - Inserted")
            elif mode == self.dbi.MODE_UPDATED:
                altered = True
                if flush_output:
                    print(" - Updated")
            elif mode == self.dbi.MODE_NOT_CHANGED and flush_output:
                print(" - No Change")
        elif flush_output:
            print("Failed")

        if flush_output:
            print("\n")

        return altered
Example No. 32
 def _language(self, article):
     if not article.meta_lang:
         text = article.title + ' ' + article.summary
         return detect(text)
     return article.meta_lang
Example No. 33
    def find_or_create(cls,
                       session,
                       _url: str,
                       language=None,
                       sleep_a_bit=False):
        """

            If not found, download and extract all
            the required info for this article.

        :param url:
        :return:
        """
        from zeeguu_core.model import Url, Article, Language
        import newspaper

        url = Url.extract_canonical_url(_url)

        try:
            found = cls.find(url)
            if found:
                return found

            art = newspaper.Article(url=url)
            art.download()
            art.parse()

            if art.text == '':
                raise Exception("Newspaper got empty article from: " + url)

            if sleep_a_bit:
                import time
                from random import randint
                print("GOT: " + url)
                sleep_time = randint(3, 33)
                print(
                    f"sleeping for {sleep_time}s... so we don't annoy our friendly servers"
                )
                time.sleep(sleep_time)

            if not language:
                if art.meta_lang == '':
                    art.meta_lang = detect(art.text)
                    zeeguu_core.log(f"langdetect: {art.meta_lang} for {url}")
                language = Language.find_or_create(art.meta_lang)

            # Create new article and save it to DB
            url_object = Url.find_or_create(session, url)

            new_article = Article(
                url_object,
                art.title,
                ', '.join(art.authors),
                art.text[
                    0:
                    32000],  # any article longer than this will be truncated...
                art.summary,
                None,
                None,
                language)
            session.add(new_article)

            session.commit()

            return new_article
        except (sqlalchemy.exc.IntegrityError, sqlalchemy.exc.DatabaseError):
            for i in range(10):
                try:
                    session.rollback()
                    u = cls.find(url)
                    print("Found article by url after recovering from race")
                    return u
                except:
                    print("Exception of second degree in article..." + str(i))
                    time.sleep(0.3)
                    continue
                break
Example No. 34
def get_lang(text):
    return detect(text)
Example No. 35
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  3 20:20:50 2018

@author: rqz
"""

import pandas as pd
# change the file name below as needed
df = pd.read_csv('/Users/yilixia/Downloads/raw_test_ylxia.csv')
from googletrans import Translator
from langdetect import detect

chinese = []
other_lan = []

for i in range(len(df)):
    if detect(df.iloc[i, 1]) == 'ko' or detect(df.iloc[i, 1]) == 'zh-tw':
        chinese.append(i)
    elif detect(df.iloc[i, 1]) != 'en':
        translator = Translator()
        fake2 = translator.translate(df.iloc[i, 1])
        df.iloc[i, 1] = fake2.text
        other_lan.append(i)
    else:
        print(i)
# after the remaining Chinese has been translated manually, write the output
df.to_csv('/Users/yilixia/Downloads/translation_lyi.csv',
          index=False,
          encoding='utf-8')
Example No. 36
def func_news_retrieve(*args, **kwarg):
    #init console log
    print("[01_news_retrieve] S Started job at " +
          str(datetime.datetime.utcnow()))

    #grab the current time
    dt = datetime.datetime.utcnow()

    #create a dictionary of rss feeds
    feeds = dict(
        thaipr_fin=r'http://www.thaipr.net/finance/feed',
        thaipr_property=r'http://www.thaipr.net/estate/feed',
        posttoday_econ=r'https://www.posttoday.com/rss/src/economy.xml',
        posttoday_fin=r'https://www.posttoday.com/rss/src/money.xml',
        posttoday_market=r'https://www.posttoday.com/rss/src/market.xml',
        posttoday_property=r'https://www.posttoday.com/rss/src/property.xml',
        bbkbiznews_buz=r'http://www.bangkokbiznews.com/rss/feed/business.xml',
        bkkbiznews_econ=r'http://www.bangkokbiznews.com/rss/feed/economic.xml',
        bkkbiznews_fin=r'http://www.bangkokbiznews.com/rss/feed/finance.xml',
        bkkbiznews_property=
        r'http://www.bangkokbiznews.com/rss/feed/property.xml',
        thaipbs_econ=r'http://news.thaipbs.or.th/rss/news/economy',
        matichon_econ=r'https://www.matichon.co.th/category/economy/feed',
        manager_stock=
        r'http://www.manager.co.th/RSS/StockMarket/StockMarket.xml',
        manager_mutualfund=
        r'http://www.manager.co.th/RSS/MutualFund/MutualFund.xml',
        manager_biz=r'http://www.manager.co.th/RSS/iBizChannel/iBizChannel.xml',
    )

    news_cat = dict(thaipr_fin='Finance',
                    thaipr_property='Property',
                    posttoday_econ='Economy',
                    posttoday_fin='Finance',
                    posttoday_market='Business',
                    posttoday_property='Property',
                    bbkbiznews_buz='Business',
                    bkkbiznews_econ='Economy',
                    bkkbiznews_fin='Finance',
                    bkkbiznews_property='Property',
                    thaipbs_econ='Economy',
                    matichon_econ='Economy',
                    manager_stock='Finance',
                    manager_mutualfund='Finance',
                    manager_biz='Business')

    news_source = dict(thaipr_fin='ThaiPR',
                       thaipr_property='ThaiPR',
                       posttoday_econ='PostToday',
                       posttoday_fin='PostToday',
                       posttoday_market='PostToday',
                       posttoday_property='PostToday',
                       bkkbiznews_buz='BangkokBizNews',
                       bkkbiznews_econ='BangkokBizNews',
                       bkkbiznews_fin='BangkokBizNews',
                       bkkbiznews_property='BangkokBizNews',
                       thaipbs_econ='ThaiPBS',
                       matichon_econ='Matichon',
                       manager_stock='Manager',
                       manager_mutualfund='Manager',
                       manager_biz='Manager')

    data = []
    count_insert = 0
    count_duplicate = 0
    filterBOTKeyword = [
        'ธปท', 'ธนาคารแห่งประเทศไทย', 'ธนาคารชาติ', 'ธนาคารกลาง', 'แบงค์ชาติ',
        'แบงก์ขาติ', 'Bank of Thailand', 'กนง', 'คณะกรรมการนโยบายการเงิน',
        'ศคง', 'ศูนย์คุ้มครองผู้ใช้บริการทางการเงิน',
        'สถาบันวิจัยเศรษฐกิจป๋วย อึ๊งภากรณ์', 'กองทุนเพื่อการฟื้นฟู', 'FIDF',
        'วิรไท สันติประภพ', 'ไพบูลย์ กิตติศรีกังวาน', 'เมธี สุภาพงษ์',
        'วชิรา อารมย์ดี', 'จาตุรงค์ จันทรังษ์', 'ฤชุกร สิริโยธิน',
        'รณดล นุ่มนนท์', 'สิริธิดา พนมวัน ณ อยุธยา', 'ณัฐวุฒิ พงศ์สิริ',
        'เพิ่มสุข สุทธินุ่น', 'วรพร ตั้งสง่าศักดิ์ศรี', 'นวพร มหารักขกะ',
        'พฤทธิพงศ์ ศรีมาจันทร์', 'สุภาวดี ปุณศรี', 'จันทวรรณ สุจริตกุล',
        'ปิติ ดิษยทัต', 'สักกะภพ พันธ์ยานุกูล', 'ดอน นาครทรรพ', 'สุรัช แทนบุญ',
        'ยรรยง ไทยเจริญ', 'รุ่ง มัลลิกะมาส'
    ]

    # Access the 'headlines' collection in the 'news' database
    client = pymongo.MongoClient()
    collection = client.sentifine.news_map
    collection_fin = client.sentifine.news_raw

    for feed, url in feeds.items():

        rss_parsed = feedparser.parse(url)

        for art in rss_parsed['items']:
            #Filter only Thai language from title
            lang = detect(art['title'])
            #print(art)
            if lang == 'th':

                #Checking if each news related with BOT
                filter_bot = 'N'
                if any(k in str(art['title'])
                       for k in filterBOTKeyword) or (any(
                           k in str(art['title_detail'])
                           for k in filterBOTKeyword)) or (any(
                               k in str(art['summary'])
                               for k in filterBOTKeyword)):
                    filter_bot = 'Y'

                published = parser.parse(art['published'])
                sentiment_default = "Retrieved"
                m = {
                    '_id': art['link'],
                    'title': art['title'],
                    'published': published,
                    'url_link': art['link'],
                    'retrieved': dt
                }

                r = {
                    'source': news_source.get(feed),
                    'source_url': feed,
                    'title': art['title'],
                    'published': published,
                    'title_detail': art['title_detail']['value'],
                    'summary': art['summary'],
                    'category': news_cat.get(feed),
                    'url_link': art['link'],
                    'retrieved': dt,
                    'filter_BOT': filter_bot,
                    'status': sentiment_default
                }

                #insert item by item because of the duplicate of some source's links
                try:
                    count_insert = count_insert + 1
                    collection.insert_one(m)  #news_map
                    collection_fin.insert_one(r)  #news_raw
                except pymongo.errors.DuplicateKeyError:
                    count_insert = count_insert - 1
                    count_duplicate = count_duplicate + 1
                    #pass #allow only this exception
                except Exception as ex:
                    print(
                        "[01_news_retrieve] E Unexpected error while inserting collection news_map & news_raw."
                    )
                    print(str(ex))
                    #raise
            else:
                print("[01_news_retrieve] W Non-Thai Content from: " +
                      art['link'])

    #final log
    print("[01_news_retrieve] I Number of Duplicated Records :" +
          str(count_duplicate))
    print("[01_news_retrieve] I Number of New Records :" + str(count_insert))
    print("[01_news_retrieve] S Finished job at " +
          str(datetime.datetime.utcnow()))
Example No. 37
 def verify_language(self, text):
     """given a text, verify that it is in a relevant language"""
     return langdetect.detect(text) == 'fr'
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from langdetect import detect, DetectorFactory

# read the csv file
df = pd.read_csv("music_lyrics.csv")

# remove columns that contains N/A values
df.drop(df.columns[[0, 1]], axis=1, inplace = True)
df1 = df.dropna()
mydf = df1[df1.lyrics != 'No Lyrics']
DetectorFactory.seed = 0

# detect lyrics language types
lang = []
for i in list(set(mydf.lyrics)):
    lang.append(detect(i))

# get the counts for languages of lyrics
Counter(lang)

# Pie chart for English lyrics and lyrics of other languages 
labels = ['English', 'Other languages']
sizes = [3982, 37]

# change figure size with modify figsize
fig, ax1 = plt.subplots(figsize=(8, 8))

# explosion
explode = (0.05, 0.05)
ax1.pie(
    sizes, labels=labels, autopct='%1.1f%%', startangle=90,
    explode=explode)
Example No. 39
def safedetect(text):
    try:
        return detect(text)
    except:
        return 'nan'
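Wrappers like safedetect are handy with pandas, since detect raises LangDetectException on empty or undetectable text; a usage sketch assuming a DataFrame df with a 'text' column:

df['lang'] = df['text'].apply(safedetect)
df_en = df[df.lang == 'en']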
Example No. 40
        if count < running_from:
            continue
        #if count > running_to:
        #    break

        print('Processing mm ' + str(count) + ': ' + mul_id)

        # retrieve data and save in SQLite
        data = []

        if type == 'image':
            links = link_retrieval.find_related_links(abs_path, nPages)
            for l in links:
                text = getTextFromLink(l)
                try:
                    if detect(text) == 'en':
                        print("===>" + l)
                        # accumulate data
                        data.append((mul_id, l, text))
                except Exception as e:
                    print(e)

        if type == 'video':
            text = getTextFromVideoLink(abs_path)
            if text != '':
                print("===>" + abs_path)
                # accumulate data
                data.append((mul_id, abs_path, text))

        # insert data into database
        c.executemany("INSERT INTO website_from_img VALUES (?,?,?)", data)
 def find_lang(x):
     try:
         return detect(x)
     except:
         return 'none'
Example No. 42
from gtts import gTTS
import os
from langdetect import detect

Text_generated = "hello"
language = detect(Text_generated)
text = Text_generated
speech = gTTS(text=text, lang=language, slow=False)
speech.save("text2.mp3")
os.system("start text2.mp3")
'''import pytesseract
import shutil
import os
import random
try:
  from PIL import Image
except ImportError:
  import Image
import glob
import cv2
def read_img(img_list, img):
    n = cv2.imread(img, 0)
    img_list.append(n)
    return img_list

path = glob.glob("*.bmp") #or jpg
list_ = []

cv_image = [read_img(list_, img) for img in path]

image_path_in_colab=r'C:\\Users\\charv\\PicTalk\\uploads\\spacejam2.png'
'''
def detect_title(title: str):
    str = title
    strc = " ".join([token.capitalize() for token in str.split()])
    lang = resolve(detect(strc))
    return lang
Example No. 44
def tokenization_process(
    text: str
) -> list:  #tokenization of text words using spacy and other techniques

    if re.sub(re.compile('\d|\:|\s|\-|\+|\!|\/|\,|\.|\=|\?|\!|\砰'), '',
              text) != '':

        #STOPWORDS
        lang = [
            'arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish',
            'french', 'german', 'greek', 'hungarian', 'indonesian', 'italian',
            'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian',
            'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish'
        ]

        try:
            stopwords = set(sw.words('english'))
        except:
            stopwords = set()

        for l in lang:
            try:
                stopwords = stopwords.union(set(sw.words(l)))
            except:
                stopwords = stopwords

        try:
            language = detect(text)
        except:
            language = 'en'

        if (language == 'en'):
            nlp = spacy.load("en_core_web_sm")
        elif (language == 'zh'):
            nlp = spacy.load("zh_core_web_sm")
        elif (language == 'da'):
            nlp = spacy.load("da_core_news_sm")
        elif (language == 'nl'):
            nlp = spacy.load("nl_core_news_sm")
        elif (language == 'fr'):
            nlp = spacy.load("fr_core_news_sm")
        elif (language == 'de'):
            nlp = spacy.load("de_core_news_sm")
        elif (language == 'el'):
            nlp = spacy.load("el_core_news_sm")
        elif (language == 'it'):
            nlp = spacy.load("it_core_news_sm")
        elif (language == 'ja'):
            nlp = spacy.load("ja_core_news_sm")
        elif (language == 'lt'):
            nlp = spacy.load("lt_core_news_sm")
        elif (language == 'nb'):
            nlp = spacy.load("nb_core_news_sm")
        elif (language == 'pl'):
            nlp = spacy.load("pl_core_news_sm")
        elif (language == 'pt'):
            nlp = spacy.load("pt_core_news_sm")
        elif (language == 'ro'):
            nlp = spacy.load("ro_core_news_sm")
        elif (language == 'es'):
            nlp = spacy.load("es_core_news_sm")
        else:
            # print('inter')
            nlp = spacy.load("xx_ent_wiki_sm")

        tokens = [
            x.lemma_.lower() for x in nlp(text)
            if (x.pos_ not in ['PUNCT', 'SPACE']) and (not x.is_stop)
        ]

        trash_tokens = [
            '–', '-', 'le', 'de', 'del', 'dell', 'della', 'l', 'degli',
            "dell'", "l'", '’', 'l’', 'dell’', '.', '?', '!', '¡', 'a', 'do',
            '(', ')', 'e-', 'e', 'el', 'r', 'n', 'se', 'una', 'alla', 'la',
            "'", 'to', 'of', 'o', "'n", 'y', "'s", ',', "'t", 'don', 'the',
            '・', 'u', '」', '「', 'в', 'por', 'el', 'du', 'les', ''
        ]

        tokens = [
            x for x in tokens if (x not in punctuation) and (
                x not in stopwords) and (x not in trash_tokens)
        ]

        return tokens

    else:
        return [text]
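The long elif ladder above maps each langdetect code to a spaCy model one branch at a time; the same lookup can be expressed as a table, shown here as a sketch with the model names copied from the branches above and the same multilingual fallback:

import spacy

SPACY_MODELS = {
    'en': 'en_core_web_sm', 'zh': 'zh_core_web_sm', 'da': 'da_core_news_sm',
    'nl': 'nl_core_news_sm', 'fr': 'fr_core_news_sm', 'de': 'de_core_news_sm',
    'el': 'el_core_news_sm', 'it': 'it_core_news_sm', 'ja': 'ja_core_news_sm',
    'lt': 'lt_core_news_sm', 'nb': 'nb_core_news_sm', 'pl': 'pl_core_news_sm',
    'pt': 'pt_core_news_sm', 'ro': 'ro_core_news_sm', 'es': 'es_core_news_sm',
}

def load_model_for(language):
    # Fall back to the multilingual pipeline when no per-language model is listed.
    return spacy.load(SPACY_MODELS.get(language, 'xx_ent_wiki_sm'))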
Example No. 45
def detect_language(x):
    return detect(x)
Example No. 46
count = -1
with open('links.txt', 'r') as f:
    links = f.readlines()
    for i in trange(len(links)):
        if i <= count:
            continue
        link = links[i]
        try:
            html = requests.get(link).text
        except (ConnectionError, ChunkedEncodingError):
            continue
        soup = BeautifulSoup(html, 'html.parser')
        song = soup.find_all('p', {'class': 'songtext'})[0]
        song_filtered = song.get_text()
        try:
            lang = detect(song_filtered[:50])
        except Exception:
            continue
        if lang != 'en':
            with open('songs_done.txt', 'w') as done_file:
                done_file.write(str(i))
            continue
        song_title = re.findall(r'/s.*?.html', link)[0][1:-5]
        try:
            with open(songs_folder + song_title + '.txt', 'w') as sf:
                sf.write(song_filtered)
        except UnicodeEncodeError as err:
            print(song_title + lang)
            print('%d files skipped' % (files_skipped + 1))
            files_skipped += 1
            try:
Example No. 47
def detect_lang_type(x):
    lang_type=(detect(x))

    return lang_type
Example No. 48
def is_french(stringx):
    try:
        return detect(stringx.lower()) == 'fr'
    except:
        return False
    def parse_page(self, response, **kwargs):
        try:
            o = {}

            title = response.xpath(
                '//h1[@class="citation_title"]/text()').get()
            if title:
                self.parser.reset()
                self.parser.feed(title)
                title = self.parser.get_text()
                d_l = detect(title)
                if d_l == 'fa':
                    t = {'title_fa': title}
                else:
                    return None
                t['title_en'] = None
                o.update(t)
            else:
                return None

            t = response.xpath(
                '(//span[@id="ar_row_ind"]/following-sibling::a)[1]/text()'
            ).get()
            if t is not None:
                t = re.findall(number_pattern, t)
            try:
                volume = int(t[0])
                number = int(t[1])
            except (IndexError, ValueError, TypeError):
                volume = None
                number = None

            download_url = response.xpath('//a[@class="pdf"]/@href').get()
            download_url = response.urljoin(download_url)

            if download_url:
                file_name = os.path.basename(download_url)
            else:
                file_name = None

            summary = response.xpath('//td[@id="abs_fa"]').get()
            if summary:
                self.parser.reset()
                self.parser.feed(summary)
                summary = self.parser.get_text()
            t = {'summary_fa': None, 'summary_en': None}
            if summary:
                d_l = detect(summary)
                if d_l == 'fa':
                    t.update({'summary_fa': summary})
                elif d_l == 'en':
                    t.update({'summary_en': summary})
            o.update(t)

            o.update({
                'volume': volume,
                'number': number,
                'file_name': '%s_%s' % (self.name, file_name) if file_name else None,
                'download_url': download_url
            })

            keywords = response.xpath(
                '//a[starts-with(@href, "./?_action=article&kw=")]/text()'
            ).getall()

            keywords_fa = []
            keywords_en = []

            for kw in keywords:
                d_l = detect(kw)
                if d_l == 'fa':
                    keywords_fa.append(kw)
                elif d_l == 'en':
                    keywords_en.append(kw)

            o['keywords_fa'] = keywords_fa if keywords_fa else None
            o['keywords_en'] = keywords_en if keywords_en else None

            yield o
        except LangDetectException:
            pass
Example No. 50
def generate_meme(path , file_name ,cmnd_type , cmnd_value, tpos=1 , bpos=1 ):



    lang = detect(cmnd_value[0])



    if lang =='ar':
        generate_meme_ar(path , file_name ,cmnd_type , cmnd_value, tpos=1 , bpos=1 )
        return

    for cmn in range(len(cmnd_type)):

        temp = cmnd_type[cmn]

        if temp == 'top':
            top_txt = cmnd_value [cmn]
        elif temp == 'bot':
            bot_txt  = cmnd_value [cmn]
        elif temp == 'bpos':
            b_per=int(cmnd_value [cmn])
        elif temp =='tpos':
            t_per=int(cmnd_value [cmn])
        elif temp =='font':
            font=int(cmnd_value [cmn])
        elif temp =='font size':
            font_size=int(cmnd_value [cmn])




    #tpo
    img = cv2.imread(path)

    ary=np.asarray(img)

    print(ary.shape)


    if tpos ==1:# left
        top_pos = 0
    elif tpos ==2:#mid
        top_pos = int(ary.shape[0] / 2)
    else:
        top_pos = int(ary.shape[0])

    if bpos ==1:# left
        bot_pos = 0
    elif bpos ==2:#mid
        bot_pos = int(ary.shape[0] / 2)
    else:
        bot_pos = int(ary.shape[0])


    print(100*ary.shape[1]/100)
    print( int(100*ary.shape[1]/100))
    top_percent = int(t_per*ary.shape[1]/100)
    bot_percent = int(b_per*ary.shape[1]/100)




    cv2.putText(img, top_txt, (top_pos, top_percent), font, font_size, (255, 255, 255), font_size, cv2.LINE_AA)

    cv2.putText(img, bot_txt, (bot_pos, bot_percent), font, font_size, (255, 255, 255), font_size,  cv2.LINE_AA)


    # Parameters are as follows:
    #
    # cv2.putText(img, text, (org), font, fontScale, color, thickness, linetype)
    #
    # img: your image
    # text: a string of text to print on image
    # org: bottom-left corner of the text string in the image (x,y)
    # font: font type
    # fontScale: font scale
    # color: text color (B,G,R)
    # thickness: text line thickness
    # lineType: line type (8)

    cv2.imwrite("./meme_generated/"+file_name,img)
Example No. 51
                content_buff += "\n"
                continue

            content_buff_temp = ""
            if (line == prev_st):
                content_buff_temp = prev_tok
            else:
                if language_mixed_en(line):
                    word_list = active_content = line.split(" ")
                    text_tmp = ""
                    tokens_eng = ""
                    tokens_th = ""
                    for word in word_list:
                        # print(word)
                        try:
                            lang = detect(word)
                        except:
                            lang = "en"

                        if lang == "th":
                            # tokenize_thai(text):
                            # print(word + " : " + detect(word))
                            tokens_eng = tokenize_eng(text_tmp)
                            tokens_th = tokenize_thai(word)
                            content_buff_temp = content_buff_temp + " " + tokens_eng + " " + tokens_th
                            tokens_eng = ""
                            tokens_th = ""
                            text_tmp = ""
                        else:
                            text_tmp = text_tmp + " " + word
                    if text_tmp != "":
Example No. 52
def detect(sentence):
    return langdetect.detect(sentence) != 'en'
Example No. 53
if __name__ == "__main__":
    new_movies_df = pd.read_csv(NEW_MOVIES_PATH)
    new_movie_ids = get_movie_ids(new_movies_df.shape[0], count_movies)
    new_movies_data = list(new_movies_df.title + ". " + new_movies_df.plot)

    for ind, elem in enumerate(new_movies_df.description):
        if elem:
            new_movies_data[ind] += " " + elem

    sent_data = div_to_sent(new_movies_data)

    print(colored("Построение векторных представлений сюжетов", "yellow"))
    rubert_embedings = []
    multilingual_embedings = []
    for ind, plot in tqdm(enumerate(sent_data)):
        if detect(plot) != "ru":
            _, _, _, _, _, _, bert_pooler_outputs = multilingual_bert(plot)
            multilingual_embedings.append(
                {"embeding": bert_pooler_outputs.mean(axis=0), "index": ind}
            )
        else:
            _, _, _, _, _, _, bert_pooler_outputs = rubert(plot)
            rubert_embedings.append(
                {"embeding": bert_pooler_outputs.mean(axis=0), "index": ind}
            )

    multilingual_emb_matrix = [elem["embeding"] for elem in multilingual_embedings]
    rubert_emb_matrix = [elem["embeding"] for elem in rubert_embedings]

    print(colored("Предсказание векторов SVD", "yellow"))
    multilingual_svd = multilingual_2_svd.predict(multilingual_emb_matrix)
Example No. 54
 def from_txt(self, file):
     with open(file, 'r') as f:
         data = f.readlines()
         for paragraph in data:
             if detect(paragraph) == 'en':
                 self.paragraphs.append(Text(paragraph))
Example No. 55
def language_detect(entry):
    try:
        return detect(entry)
    except Exception as e:
        print(f"{e} Using default language as english.")
        return 'en'
Example No. 56
def langtype(v):
    try:
        lang = detect(unicode(v)[0])
        return lang
    except:
        return v