def detect_lang(text, lang_code_hint=None):
    """
    Try to detect the language of a given text.

    :param unicode text: Body of text of which we want to detect the language
    :param string lang_code_hint: Expected language code. May bias detection
        towards the expected language
    :rtype: tuple
    :returns: detected language name in English, language code, reliability flag,
        list of possible languages - lists details for other possible matches of
        the form [('ENGLISH', 'en', text percentage, normalized score), ...]
    """
    try:
        detected_lang, detected_lang_code, is_reliable, textBytesFound, details = cld.detect(
            text,
            hintLanguageCode=lang_code_hint,
            pickSummaryLanguage=False,
            isPlainText=False,
            removeWeakMatches=False
        )
    except UnicodeEncodeError:
        detected_lang, detected_lang_code, is_reliable, textBytesFound, details = cld.detect(
            text.encode("utf-8"),
            hintLanguageCode=lang_code_hint,
            pickSummaryLanguage=False,
            isPlainText=False,
            removeWeakMatches=False,
        )
    except cld.error as e:
        raise Exception(str(e))

    possible_matches = []
    for d in details:
        possible_matches.append((d[0].title(), d[1], d[2]))

    return (detected_lang.title(), detected_lang_code, is_reliable, possible_matches)
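# Hedged usage sketch (added for illustration, not from the original source).
# It assumes `cld` is the chromium-compact-language-detector binding used
# throughout these examples and that detect_lang() above is in scope.
def _demo_detect_lang():
    # Hypothetical sample input; the hint biases detection towards French.
    name, code, reliable, matches = detect_lang(u"Bonjour tout le monde", lang_code_hint='fr')
    # Expected shape: name like 'French', code like 'fr', reliable is a bool,
    # matches like [('French', 'fr', 100, <normalized score>), ...]
    return name, code, reliable, matches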
def process(self, file_data):
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        file_mime_type = m.id_buffer(file_data['contents'])
    metadata = {}
    if file_mime_type == 'text/plain':
        text = file_data['contents']
    elif file_mime_type == 'text/html':
        text = parse_html(file_data['contents'], True, ['script', 'style'])
    elif file_mime_type == 'application/pdf':
        text, metadata = extract_pdf(file_data['contents'])
    else:
        # If we can't detect the mimetype we add a flag that can be read by
        # the frontend to provide more information on why the document
        # wasn't processed.
        # XXX: We're returning an empty text because if we don't the
        # pipeline will run indefinitely. The right approach is to make
        # pypelinin understand a specific exception (something like
        # StopPipeline) as a signal to stop processing this pipeline.
        return {
            'mimetype': 'unknown',
            'text': "",
            'file_metadata': {},
            'language': ""
        }

    text, forced_decoding = trial_decode(text)

    if isinstance(text, unicode):
        # HTMLParser only handles unicode objects. We can't pass the text
        # through it if we don't know the encoding, and it's possible we
        # also shouldn't. There's no way of knowing if it's a badly encoded
        # HTML page or a binary blob that happens to have bytes that look
        # like HTML entities.
        text = HTMLParser().unescape(text)

    text = clean(text)

    if isinstance(text, unicode):
        language = cld.detect(text.encode('utf-8'))[1]
    else:
        language = cld.detect(text)[1]

    return {
        'text': text,
        'file_metadata': metadata,
        'language': language,
        'mimetype': file_mime_type,
        'forced_decoding': forced_decoding
    }
def clean(self, bid=None):
    super(Donation, self).clean()
    if self.domain == 'LOCAL':  # local donations are always complete, duh
        if not self.donor:
            raise ValidationError('Local donations must have a donor')
        self.transactionstate = 'COMPLETED'
    if not self.donor and self.transactionstate != 'PENDING':
        raise ValidationError('Donation must have a donor when in a non-pending state')
    if not self.domainId and self.donor and self.timereceived:
        self.domainId = str(calendar.timegm(self.timereceived.timetuple())) + self.donor.email
    bids = set()
    if bid:
        bids |= set([bid])
    bids |= set(self.bids.all())
    bids = map(lambda b: b.amount, bids)
    bidtotal = reduce(lambda a, b: a + b, bids, Decimal('0'))
    if self.amount and bidtotal > self.amount:
        raise ValidationError('Bid total is greater than donation amount: %s > %s' % (bidtotal, self.amount))
    tickets = self.tickets.all()
    ticketTotal = reduce(lambda a, b: a + b, map(lambda b: b.amount, tickets), Decimal('0'))
    if self.amount and ticketTotal > self.amount:
        raise ValidationError('Prize ticket total is greater than donation amount: %s > %s' % (ticketTotal, self.amount))
    if self.comment and cld:
        if self.commentlanguage == 'un' or self.commentlanguage is None:
            detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
                self.comment.encode('utf-8'), hintLanguageCode='en')
            if detectedLangCode in map(lambda x: x[0], LanguageChoices):
                self.commentlanguage = detectedLangCode
            else:
                self.commentlanguage = 'un'
    else:
        self.commentlanguage = 'un'
def download_article_file(articleURL, articleFileDirectory, code):
    articleFilePath = articleFileDirectory + code

    # Download the article and save as file
    if articleURL == "":
        print "ERROR: Empty URL detected! File not created"
        return None
    else:
        # If a directory for files doesn't exist, create it
        dir = os.path.dirname(articleFileDirectory)
        if not os.path.isdir(dir):
            #print "Created directory: " + dir
            os.makedirs(dir)

        try:
            #fullArticle = urllib2.urlopen(articleURL)
            #fullArticleText = fullArticle.read()

            # Use boilerpipe to remove boilerplate and formatting
            extractor = Extractor(extractor='ArticleExtractor', url=articleURL)
            fullArticleText = extractor.getText()

            # Test to see if article is in English. If not, then return None
            top_language = cld.detect(fullArticleText.encode('utf-8'))[0]
            if top_language != 'ENGLISH':
                print "SKIPPED: Article is in " + top_language
                return None

            outfile = open(articleFilePath, 'w+')
            outfile.write(fullArticleText.encode('ascii', 'ignore'))
            outfile.close()

            # Use lxml's HTML cleaner to remove markup
            #htmltree = lxml.html.fromstring(fullArticleText)
            #cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=True)
            #cleaned_tree = cleaner.clean_html(htmltree)
            #return cleaned_tree.text_content()
            return fullArticleText

        except urllib2.HTTPError:
            print "ERROR: HTTPError. Article file download skipped: " + articleURL
            return None
        except urllib2.URLError:
            print "ERROR: URLError. Article file download skipped: " + articleURL
            return None
        except LookupError:
            print "ERROR: LookupError. Article file download skipped: " + articleURL
            return None
        except UnicodeDecodeError:
            print "ERROR: UnicodeDecodeError. Article file download skipped: " + articleURL
            return None
        except:
            print "ERROR: ", sys.exc_info()[0]
            return None
def check_german(tweet_text):
    emoji_key = pandas.read_csv('DATA/emoji_table.txt', encoding='utf-8', index_col=0)
    emoji_key['count'] = 0
    emoji_dict = emoji_key['count'].to_dict()
    emoji_dict_total = emoji_key['count'].to_dict()
    emoji_list = emoji_dict.keys()

    tweet_text = unicode(tweet_text, 'utf-8')
    tweet_text = tweet_text.encode('utf-8')
    tokens = tweet_text.split(' ')
    new_text = ''

    # delete @username
    for token in tokens:
        if '@' not in token:
            new_text += token + ' '
    new_text = new_text.lower()

    text = unicode(new_text, 'utf-8')
    text = text.encode('utf-8')
    top_language_name = cld.detect(text)
    lang_form_langid = langid.classify(text)

    if new_text == '':
        return True  # if text is empty - treat as German
    if top_language_name[0] == 'GERMAN' or lang_form_langid[0] == 'de':
        return True
    else:
        return False
def langDetect(s):
    import cld
    langsSeen = set()
    detLangsSeen = set()
    detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
        s, pickSummaryLanguage=True, removeWeakMatches=False)
    if DEBUG:
        log('CLD :')
        log('  detected: %s' % detectedLangName)
        log('  reliable: %s' % (isReliable != 0))
        log('  textBytes: %s' % textBytesFound)
        log('  details: %s' % str(details))
        for tup in details:
            detLangsSeen.add(tup[0])
        log('  %d langs; %d ever detected' % (len(langsSeen), len(detLangsSeen)))
        log("\n")
    if detectedLangName == 'Unknown':
        return 'Unknown'
    else:
        return [i[1] for i in details]
def runOne(self, expectedLangName, s, shouldBeReliable=True):
    if VERBOSE:
        print
        print 'Test: %s [%d bytes]' % (expectedLangName, len(s))
    detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
        s, pickSummaryLanguage=True, removeWeakMatches=False)
    if VERBOSE:
        print '  detected: %s' % detectedLangName
        print '  reliable: %s' % (isReliable != 0)
        print '  textBytes: %s' % textBytesFound
        print '  details: %s' % str(details)
    self.langsSeen.add(expectedLangName)
    for tup in details:
        self.detLangsSeen.add(tup[0])
    print '  %d langs; %d ever detected' % (len(self.langsSeen), len(self.detLangsSeen))
    if False:
        if expectedLangName == 'YIDDISH':
            l = list(self.detLangsSeen)
            l.sort()
            for i, name in enumerate(l):
                print '  PyTuple_SET_ITEM(pyDetLangs, %d, PyString_FromString("%s"));' % (i, name)
    self.assertEquals(
        expectedLangName, detectedLangName,
        '%s != %s; details: %s' % (detectedLangName, expectedLangName, str(details)))
    self.assertTrue(not shouldBeReliable or isReliable)
def check_german(tweet_text):
    if isinstance(tweet_text, unicode) is False:
        tweet_text = unicode(tweet_text, 'utf-8')
    tweet_text = tweet_text.encode('utf-8')
    tokens = tweet_text.split(' ')
    new_text = ''

    # delete @username
    for token in tokens:
        if '@' not in token:
            new_text += token + ' '
    new_text = new_text.lower()

    if isinstance(new_text, unicode) is False:
        text = unicode(new_text, 'utf-8')
        text = text.encode('utf-8')
    else:
        text = new_text.encode('utf-8')

    top_language_name = cld.detect(text)
    lang_form_langid = langid.classify(text)

    if new_text == '':
        return True  # if text empty - german
    if top_language_name[0] == 'GERMAN' or lang_form_langid[0] == 'de':
        return True
    else:
        return False
def spellcheck(self, text, tld=''):
    from scanner.models import BadWord

    # guess language code
    self.log.debug(' * guessing language...')
    #lang_code, lang_num, lang_name = guess_language.guessLanguageInfo(text)
    lang_name, lang_code, reliable, bytes_found, details = \
        cld.detect(text.encode('utf-8'), hintTopLevelDomain=tld)
    self.log.debug(' -> detected lang: %s (%s)' % (lang_name, lang_code))
    if lang_code.upper() == 'UNKNOWN' or lang_name.upper() == 'UNKNOWN' or not reliable:
        self.log.warning(' -> Cannot detect language of page - end : %s' % details)
        return None, set()

    self.log.debug(' * searching for dictionary')
    try:
        checker = enchant.checker.SpellChecker(
            lang_code,
            filters=[
                EmailFilter,
                URLFilter,
                # BetterURLFilter,
            ])
    except enchant.DictNotFoundError:
        if lang_code in self.not_supported_lang:
            self.log.debug(
                " -> Cannot find language for spellchecker for %s - end (blacklisted)" % lang_code)
        else:
            self.log.error(
                " -> Cannot find language for spellchecker for %s - end" % lang_code)
        return None, set()

    # checking page for bad words
    self.log.debug(' * check spelling...')
    checker.set_text(text)
    self.log.debug(' -> ok')

    self.log.debug(' * get errors...')
    errors = [er.word for er in checker if len(er.word) < 128]
    self.log.debug(' -> ok')

    self.log.debug(' * found %d bad words and adding them to DB' % len(errors))
    BadWord.objects.bulk_create(
        [BadWord(word=bad_word.strip().lower()) for bad_word in errors])
    self.log.debug(' -> ok')

    self.log.debug(' * call filtering bad words')
    errors = BadWord.filter_bad_words(errors)
    self.log.debug(' -> ok')

    self.log.debug(' * after filtering out there is %d errors (%s)' % (len(errors), errors))
    return lang_name, set(errors)
def detect_language(text):
    """
    Detect the language of text using chromium_compact_language_detector

    :param text: text to be analyzed
    :return: e.g. {"name": "PORTUGUESE", "code": "pt"}
    """
    name, code, isReliable, textBytesFound, details = cld.detect(text.encode('utf8'))
    return {"name": name, "code": code}
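# Hedged usage sketch (illustration only, not from the original source),
# assuming detect_language() above is in scope and that the `cld` binding
# reports upper-case English language names, as the other examples here do.
def _demo_detect_language():
    result = detect_language(u"Ola, tudo bem?")
    # Expected shape: {"name": "PORTUGUESE", "code": "pt"}
    return result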
def filter(self, json_tweet_string):
    tweet = json.loads(json_tweet_string)
    # CLD expects a bytestring encoded as UTF-8, and not a unicode string
    tweet_text = codecs.encode(tweet['text'], 'utf-8')
    # Per the CLD docs, "isReliable is True if the top language is much better than 2nd best language."
    topLanguageName, topLanguageCode, isReliable, textBytesFound, details = cld.detect(tweet_text)
    if topLanguageName == "ENGLISH" and isReliable:
        return True
    else:
        return False
def __init__(self, status):
    self.status = status
    self.cld_result = None
    try:
        # topLanguageName, topLanguageCode, isReliable, textBytesFound, details
        self.cld_result = cld.detect(status.text.encode("ascii", "ignore"),
                                     isPlainText=True,
                                     includeExtendedLanguages=False)
    except UnicodeEncodeError as e:
        log.warn("language detection failed on %s" % repr(status.text))
def process(self, file_data):
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        file_mime_type = m.id_buffer(file_data['contents'])
    metadata = {}
    if file_mime_type == 'text/plain':
        text = file_data['contents']
    elif file_mime_type == 'text/html':
        text = parse_html(file_data['contents'], True, ['script', 'style'])
    elif file_mime_type == 'application/pdf':
        text, metadata = extract_pdf(file_data['contents'])
    else:
        # If we can't detect the mimetype we add a flag that can be read by
        # the frontend to provide more information on why the document
        # wasn't processed.
        # XXX: We're returning an empty text because if we don't the
        # pipeline will run indefinitely. The right approach is to make
        # pypelinin understand a specific exception (something like
        # StopPipeline) as a signal to stop processing this pipeline.
        return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""}

    text, forced_decoding = trial_decode(text)

    if isinstance(text, unicode):
        # HTMLParser only handles unicode objects. We can't pass the text
        # through it if we don't know the encoding, and it's possible we
        # also shouldn't. There's no way of knowing if it's a badly encoded
        # HTML page or a binary blob that happens to have bytes that look
        # like HTML entities.
        text = HTMLParser().unescape(text)

    text = clean(text)

    if isinstance(text, unicode):
        language = cld.detect(text.encode('utf-8'))[1]
    else:
        language = cld.detect(text)[1]

    return {'text': text,
            'file_metadata': metadata,
            'language': language,
            'mimetype': file_mime_type,
            'forced_decoding': forced_decoding}
def _language(si, context):
    if si.body and si.body.raw:
        name, code, is_reliable, num_text_bytes, details = cld.detect(si.body.raw)
        if is_reliable and code != "xxx":
            si.body.language = Language(code=code, name=name)
        else:
            si.body.language = Language(code="", name="")
    elif si.body:
        ## no .body.raw -- rare, but not impossible
        si.body.language = Language(code="", name="")
    return si
def runOne(self, expectedLangName, s):
    if VERBOSE:
        print
        print 'Test: %s [%d bytes]' % (expectedLangName, len(s))
    detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(s)
    if VERBOSE:
        print '  detected: %s' % detectedLangName
        print '  reliable: %s' % (isReliable != 0)
        print '  textBytes: %s' % textBytesFound
        print '  details: %s' % str(details)
    self.langsSeen.add(expectedLangName)
    print '  %d langs' % len(self.langsSeen)
    self.assertEquals(expectedLangName, detectedLangName)
    self.assertTrue(isReliable)
def process_line(tweet_line):
    line = tweet_line.rstrip()
    tweet_array = [splits for splits in line.split("\t") if splits != ""]
    tweet_id = tweet_array[0]
    # Sun Jan 23 00:04:13 +0000 2011
    tweet_date = tweet_array[1]
    tweet_content = tweet_array[4].lower()
    # apply cld to find out the language of the tweet
    tweet_lang = cld.detect(tweet_content)[0]
    if tweet_lang == 'ENGLISH':
        process_english_tweet(tweet_content, tweet_date, tweet_id)
    return line
def get_desc_from_folder(folder_path, desc_count=1000):
    name_desc_pairs = {}
    count = desc_count
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".json"):
                if len(name_desc_pairs) < count:
                    # FIXME: unicode error \xc3\xa2\xc2\x99\xc2\xa3
                    desc = get_desc(os.path.join(root, file))
                    desc_utf8 = desc.encode('utf-8')
                    if len(desc) > 1000:
                        lang = cld.detect(desc_utf8)
                        if lang[1] == 'en' and len(lang[4]) == 1:
                            name_desc_pairs[file] = desc
    return name_desc_pairs
def handle_tweet(tweet):
    if 'langid_guess' not in tweet:
        tweet['langid_guess'] = langid.classify(tweet['text'])[0]
    if 'ldig_guess' not in tweet:
        tweet['ldig_guess'] = det.detect('model.latin', tweet['text'])[1]
    cld_guess = cld.detect(tweet['text'].encode('utf-8'))[1]
    guesses = {'langid_guess': tweet['langid_guess'],
               'ldig_guess': tweet['ldig_guess'],
               'twitter_guess': tweet['lang'],
               'cld_guess': cld_guess}
    del tweet['langid_guess']
    del tweet['ldig_guess']
    del tweet['lang']
    tweet['langs'] = guesses
    return tweet
def chromium_cld(page):
    '''
    Run the Chromium Compact Language Detector on the given page.
    '''
    # Python binding to C++
    data = cld.detect(page)
    if data[4] == []:
        # this could happen when CLD returns 'unknown'
        result = {'und': 1}
    else:
        result = dict([(isoify(l[1]), l[2] / 100.0) for l in data[4]])
    return {
        'data': data,
        'result': result,
        'best': isoify(data[1]),
        'best_name': iso639_3_index[isoify(data[1])]['ref_name']
    }
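# Hedged usage sketch (illustration only, not from the original source). It
# assumes the surrounding module's `isoify` and `iso639_3_index` helpers are
# available and that `page` is a UTF-8 bytestring, as chromium_cld() above expects.
def _demo_chromium_cld(page):
    detected = chromium_cld(page)
    best_code = detected['best']   # ISO code of the top language
    scores = detected['result']    # mapping of language code -> share of text in [0, 1]
    return best_code, scores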
def process_tweet_xml_elm(tweet_xml_elm):
    tweet_text = etree.tostring(tweet_xml_elm.find("tweet_text"), method='text', encoding="UTF-8")
    tweet_text = remove_unicode_chr(smart_unicode(tweet_text))
    tweet_id = tweet_xml_elm.find("tweet_id").text
    timestamp = tweet_xml_elm.find("timestamp").text
    created_at = tweet_xml_elm.find("created_at").text
    user_id = tweet_xml_elm.find("user_id").text
    langName, lang, isReliable, textBytesFound, details = cld.detect(
        tweet_text, pickSummaryLanguage=True, removeWeakMatches=False)
    #STRIP_NON_EN:
    if (STRIP_NON_EN and lang != 'en') or (STRIP_RT and tweet_text.upper().startswith("RT")):
        return lang, None
    return lang, Tweet(tweet_id, user_id, timestamp, created_at, tweet_text)
def process(input_file):
    count = 0
    reader = unicode_csv_reader(open(input_file))
    for status in reader:
        count = count + 1
        status_text = status[_FIELDS.index("text")]
        tb = TextBlob(status_text)
        lang_detected = cld.detect(status_text.encode("utf-8"))
        # consider only english tweets
        if lang_detected[0] == "ENGLISH":
            sentiment = decode_sentiment(tb.sentiment[0])
            print str(count), sentiment[1], " - ", status_text.encode("utf-8")
            try:
                csvwriter.writerow([
                    status[_FIELDS.index("id")],
                    status[_FIELDS.index("created_at")],
                    status[_FIELDS.index("user_id")],
                    status[_FIELDS.index("user_screen_name")],
                    status[_FIELDS.index("user_name")].encode("utf-8"),
                    status[_FIELDS.index("user_description")].encode("utf-8"),
                    status[_FIELDS.index("user_created_at")],
                    status[_FIELDS.index("user_followers_count")],
                    status[_FIELDS.index("user_friends_count")],
                    status[_FIELDS.index("user_statuses_count")],
                    status[_FIELDS.index("user_listed_count")],
                    status[_FIELDS.index("text")].encode("utf-8"),
                    status[_FIELDS.index("source")].encode("utf-8"),
                    status[_FIELDS.index("retweet_count")],
                    status[_FIELDS.index("place_id")],
                    status[_FIELDS.index("place_type")],
                    status[_FIELDS.index("place_country_code")],
                    status[_FIELDS.index("place_country")].encode("utf-8"),
                    status[_FIELDS.index("place_name")].encode("utf-8"),
                    status[_FIELDS.index("place_fullname")].encode("utf-8"),
                    sentiment[1],
                    sentiment[0]
                ])
            except Exception as e:
                print "Exception ", e
def handleSingleLangdet_cld(cur, articleId, text, commit=True):
    """
    Like handleSingleLangdet(), but using Google's CLD library.
    If `commit` is given, commits the transaction at the end.
    """
    try:
        lc_alt = cld.detect(text.encode('utf8', 'replace'))[1]
    except:
        print repr(text)
        raise
    if lc_alt == 'un' or lc_alt == 'xxx':
        # "un" means "unknown"
        lc_alt = lc_iso = None
    else:
        lc_iso = iso_map.iso2to3[lc_alt.split('-')[0]]
    debug('Article %d has language (%s / %s)', articleId, lc_iso, lc_alt)
    cur.execute(
        "UPDATE feed_article_meta SET lang_iso=%s, lang_altcode=%s, lang_is_cld=%s WHERE id=%s",
        (lc_iso, lc_alt, True, articleId))
    if commit:
        cur.connection.commit()
    return (lc_iso, lc_alt)
def __call__(self, si, context):
    if si.body and si.body.raw:
        name, code, is_reliable, num_text_bytes, details = cld.detect(si.body.raw)
        if is_reliable and code != 'xxx':
            si.body.language = Language(code=code, name=name)
        else:
            si.body.language = Language(code='', name='')
    elif si.body:
        ## no .body.raw -- rare, but not impossible
        si.body.language = Language(code='', name='')
    if 'force' in self.config:
        si.body.language = Language(
            code=self.config['force'].get('code'),
            name=self.config['force'].get('name'))
    return si
def __call__(self, si, context):
    if si.body and si.body.raw:
        name, code, is_reliable, num_text_bytes, details = cld.detect(si.body.raw)
        if is_reliable and code != 'xxx':
            si.body.language = Language(code=code, name=name)
        else:
            si.body.language = Language(code='', name='')
    elif si.body:
        ## no .body.raw -- rare, but not impossible
        si.body.language = Language(code='', name='')
    if 'force' in self.config:
        si.body.language = Language(code=self.config['force'].get('code'),
                                    name=self.config['force'].get('name'))
    return si
def handleSingleLangdet_cld(cur, articleId, text, commit=True):
    """
    Like handleSingleLangdet(), but using Google's CLD library.
    If `commit` is given, commits the transaction at the end.
    """
    try:
        lc_alt = cld.detect(text.encode('utf8', 'replace'))[1]
    except:
        print repr(text)
        raise
    if lc_alt == 'un' or lc_alt == 'xxx':
        # "un" means "unknown"
        lc_alt = lc_iso = None
    else:
        lc_iso = iso_map.iso2to3[lc_alt.split('-')[0]]
    debug('Article %d has language (%s / %s)', articleId, lc_iso, lc_alt)
    cur.execute(
        "UPDATE feed_article_meta SET lang_iso=%s, lang_altcode=%s, lang_is_cld=%s WHERE id=%s",
        (lc_iso, lc_alt, True, articleId))
    if commit:
        cur.connection.commit()
    return (lc_iso, lc_alt)
def clean(self, bid=None):
    super(Donation, self).clean()
    if self.domain == 'LOCAL':  # local donations are always complete, duh
        if not self.donor:
            raise ValidationError('Local donations must have a donor')
        self.transactionstate = 'COMPLETED'
    if not self.donor and self.transactionstate != 'PENDING':
        raise ValidationError('Donation must have a donor when in a non-pending state')
    if not self.domainId and self.donor and self.timereceived:
        self.domainId = str(calendar.timegm(self.timereceived.timetuple())) + self.donor.email
    bids = set(self.bids.all())
    # because non-saved bids will not have an id, they are not hashable, so we have to special case them
    if bid:
        if not bid.id:
            bids = list(bids) + [bid]
        else:
            # N.B. the order here is very important, as we want the new copy of bid to override the old one (if present)
            bids = list(set([bid]) | bids)
    bids = map(lambda b: b.amount, bids)
    bidtotal = reduce(lambda a, b: a + b, bids, Decimal('0'))
    if self.amount and bidtotal > self.amount:
        raise ValidationError('Bid total is greater than donation amount: %s > %s' % (bidtotal, self.amount))
    tickets = self.tickets.all()
    ticketTotal = reduce(lambda a, b: a + b, map(lambda b: b.amount, tickets), Decimal('0'))
    if self.amount and ticketTotal > self.amount:
        raise ValidationError('Prize ticket total is greater than donation amount: %s > %s' % (ticketTotal, self.amount))
    if self.comment and cld:
        if self.commentlanguage == 'un' or self.commentlanguage is None:
            detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
                self.comment.encode('utf-8'), hintLanguageCode='en')
            if detectedLangCode in map(lambda x: x[0], LanguageChoices):
                self.commentlanguage = detectedLangCode
            else:
                self.commentlanguage = 'un'
    else:
        self.commentlanguage = 'un'
def process_tweet_xml_elm(tweet_xml_elm):
    tweet_text = etree.tostring(tweet_xml_elm.find("tweet_text"), method='text', encoding="UTF-8")
    tweet_text = remove_unicode_chr(smart_unicode(tweet_text))
    tweet_id = tweet_xml_elm.find("tweet_id").text
    timestamp = tweet_xml_elm.find("timestamp").text
    created_at = tweet_xml_elm.find("created_at").text
    user_id = tweet_xml_elm.find("user_id").text
    langName, lang, isReliable, textBytesFound, details = cld.detect(
        tweet_text, pickSummaryLanguage=True, removeWeakMatches=False)
    #STRIP_NON_EN:
    if (STRIP_NON_EN and lang != 'en') or (STRIP_RT and tweet_text.upper().startswith("RT")):
        return lang, None
    return lang, Tweet(tweet_id, user_id, timestamp, created_at, tweet_text)
def langDetect(s):
    import cld
    langsSeen = set()
    detLangsSeen = set()
    detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
        s, pickSummaryLanguage=True, removeWeakMatches=False)
    if DEBUG:
        log('CLD :')
        log('  detected: %s' % detectedLangName)
        log('  reliable: %s' % (isReliable != 0))
        log('  textBytes: %s' % textBytesFound)
        log('  details: %s' % str(details))
        for tup in details:
            detLangsSeen.add(tup[0])
        log('  %d langs; %d ever detected' % (len(langsSeen), len(detLangsSeen)))
        log("\n")
    if detectedLangName == 'Unknown':
        return 'Unknown'
    else:
        return [i[1] for i in details]
def filter(reader, out, meta_out):
    global counter_all
    counter_fi = 0
    counter = 0
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    #reader.sync(4042)
    position = reader.getPosition()
    while reader.next(key, value):
        k = key.toString()
        v = value.toString()
        position = reader.getPosition()
        #lang, w = langid.classify(v[:312])
        lang = cld.detect(v[:400])
        #print lang
        #if omor.is_fin_string(v):
        #pdb.set_trace()
        counter += 1
        if lang[0] == 'FINNISH':
            counter_fi += 1
            out.write(magicStringB)
            out.write(k + '\n')
            out.write('<lang>\n' + str(lang) + '\n</lang>')
            out.write("\n")
            out.write(v)
            out.write(magicStringE)
            meta_out.write(k + '\n' + str(lang) + '\n')
    counter_all += counter
    print >> sys.stderr, "Global counter", counter_all
    print >> sys.stderr, "Fin / All", counter_fi, counter
def runOne(self, expectedLangName, s, shouldBeReliable=True):
    if VERBOSE:
        print
        print 'Test: %s [%d bytes]' % (expectedLangName, len(s))
    detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
        s, pickSummaryLanguage=True, removeWeakMatches=False)
    if VERBOSE:
        print '  detected: %s' % detectedLangName
        print '  reliable: %s' % (isReliable != 0)
        print '  textBytes: %s' % textBytesFound
        print '  details: %s' % str(details)
    self.langsSeen.add(expectedLangName)
    for tup in details:
        self.detLangsSeen.add(tup[0])
    print '  %d langs; %d ever detected' % (len(self.langsSeen), len(self.detLangsSeen))
    if False:
        if expectedLangName == 'YIDDISH':
            l = list(self.detLangsSeen)
            l.sort()
            for i, name in enumerate(l):
                print '  PyTuple_SET_ITEM(pyDetLangs, %d, PyString_FromString("%s"));' % (i, name)
    self.assertEquals(expectedLangName, detectedLangName,
                      '%s != %s; details: %s' % (detectedLangName, expectedLangName, str(details)))
    self.assertTrue(not shouldBeReliable or isReliable)
import cld
import os
from fnmatch import fnmatch
import sys

# open the temp file java generated.
with open(sys.argv[1], "r") as myfile:
    data = myfile.read().replace('\n', '')

detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
    str(data), pickSummaryLanguage=False, removeWeakMatches=False)
print ' lang: %s ,reliable: %i' % (detectedLangCode, isReliable)
def detect_articles_language():
    for x in Article.objects.all().iterator():
        x.language = cld.detect(x.excerpt.encode('ascii', 'ignore'))[1]
        x.save()
aids = defaultdict(list)  # lang group -> list of article ids.
done_hosts = set()

for lg in LANG_GROUPS:
    f = open('./evaluation/compare_%s.html' % lg, 'w')
    f.write('<style>table{table-layout:fixed}td{vertical-align:top;border:solid #ccc 1px;border-top:solid black 2px;}.tiny{font-size:70%; width:45%}</style>')
    f.write('<table><tr><th>Article info<th>diff(old Tomaz, new Tomaz)<th class="tiny">diff(Mitja, new Tomaz)')
    f.close()

for i, row in enumerate(cur):
    url = row['url']
    host = url.split('/')[2]
    if host in done_hosts:
        continue
    done_hosts.add(host)
    print i, row['id'], host

    lang = cld.detect(row['content'])[1].split('-')[0]
    if lang == 'un':
        continue
    elif lang == 'en':
        lang_group = 'english'
    elif lang in 'ja ko zh hi ms ml te ta jw oc ur gu th kn pa fa km'.split():
        lang_group = 'syllabary'
    else:
        lang_group = 'alphabet'

    if len(aids[lang_group]) >= 50:
        continue
    aids[lang_group].append(row['id'])

    try:
        html = str(row['content']).decode('utf8')
    except:
        continue
from stools.wordcloud.wordcloud import make_wordcloud

folder_path = "data_google_play"
json_names = [['extendedInfo', 'description']]

if not os.path.exists(folder_path):
    call(["git", "clone", "https://github.com/sangheestyle/data_google_play.git"])

json_contents, file_names = stio.read_json_folder(folder_path, '.json', json_names)
descriptions = zip(*json_contents)[0]

# The number of input files is 1365, but the number of results in apk_info
# is 971 due to filtering by length of description and language (English only)
apk_info = []
for idx, desc in enumerate(descriptions):
    if len(desc) > 1000:
        lang = cld.detect(desc.encode('utf-8'))
        if lang[1] == 'en' and len(lang[4]) == 1:
            apk_info.append([file_names[idx], desc.encode('ascii', errors='ignore')])

filtered_desc = zip(*apk_info)[1]
stemmed_list = nlp.trs(filtered_desc, "snowball")
dictionary = nlp.dictionary(stemmed_list)
corpus = nlp.corpus(stemmed_list, dictionary)
corpus_lda = nlp.lda(corpus, dictionary, num_topics=5)

vectors = [array(f) for f in corpus_lda]
prediction = ml.KMeans(vectors, n_clusters=5, max_iter=1000)

for idx, item in enumerate(apk_info):
    apk_name = item[0]
    stemmed_content = ' '.join(stemmed_list[idx])
    groupID = prediction[idx]
import cld

conn, cur = openConnection()
cur = conn.cursor('x')
cur.execute(
    "SELECT m.id, p.content, m.lang_altcode FROM processed_article p JOIN feed_article_meta m ON (p.feed_articleid = m.id) WHERE p.mode='cleartext' ORDER BY m.id DESC LIMIT 100000"
)

cnt = {}
cnt2 = {}
while True:
    row = cur.fetchone()
    if not row:
        break
    aid, txt, lang = row
    lang = str(lang[:2])
    lang2 = cld.detect(txt.encode('utf8', 'ignore'))[1]
    cnt[lang] = cnt.get(lang, 0) + 1
    cnt2[lang2] = cnt2.get(lang2, 0) + 1
    print 'done', sum(cnt.itervalues())
print 'done'

def top(d, n=60):
    for pair in sorted(d.iteritems(), key=lambda pair: -pair[1])[:n]:
        print '%s %5d' % pair

print 'DATABASE SAYS:'
top(cnt)
print '\nCLD SAYS:'
def update_all(self):
    for course in Course.objects.filter(language__iexact="english"):
        topLanguageName, topLanguageCode, isReliable, textBytesFound, details = cld.detect(
            course.title.encode('utf-8') + " " + course.description.encode('utf-8'))
        if topLanguageCode not in ['en', 'un']:
            # force unidentified languages into English
            if topLanguageCode == 'un':
                topLanguageName = 'English'
            language = topLanguageName.lower().capitalize()
            if language in LANG_MAPPING:
                course.language = language
                course.save()
                continue
            else:
                self.stdout.write("New language %s %s (reliable: %s)" % (topLanguageName, topLanguageCode, isReliable))
def detect(self, utf8):
    name, code, reliable, numBytes, details = cld.detect(utf8,
                                                         isPlainText=True,
                                                         removeWeakMatches=False,
                                                         pickSummaryLanguage=False)
    for tup in details:
        self.allLangs.add(tup[0])
    return code, reliable
def get_language(text):
    """Returns the language of a given text as a tuple like (LANGUAGE, language-code)"""
    return cld.detect(text)[:2]
inputDB = "myDB"
connector = sqlite3.connect(inputDB)
selecter = connector.cursor()

##########
# ALTER TABLE FOR A COLUMN FOR THE LANGUAGE
selecter.execute('''ALTER TABLE myDB ADD COLUMN "description_abbreviation_chrome" 'TEXT' ''')
selecter.execute('''ALTER TABLE myDB ADD COLUMN "description_language_chrome" 'TEXT' ''')
selecter.execute('''ALTER TABLE myDB ADD COLUMN "description_reliability_chrome" 'TEXT' ''')
##########

selecter.execute('''SELECT id_db, description FROM myDB''')

for row in selecter:
    #print(row)
    a = ''.join(row[1])

    # chrome language detection
    goog = detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect("{}".format(a))
    lancode = detectedLangCode
    lanname = detectedLangName
    lanreliability = isReliable != 0
    landetails = details
    #print("lancode: ", lancode)
    #print("lanname: ", lanname)
    #print("lanreliability: ", lanreliability)
    print(row[0])
    #print(a)

    connector.execute('''update myDB set description_abbreviation_chrome = ? , description_language_chrome = ? , description_reliability_chrome = ? where id_db == ?''',
                      (lancode, lanname, lanreliability, row[0]))

connector.commit()  # save changes
connector.close()
def get_full_article(this_item, feed_id, **kwargs):
    if len(this_item['link']) > 200:
        this_item['link'] = short.shorten(this_item['link'])['url']
    if any(required not in this_item for required in ['title', 'link']):
        return
    try:
        Article.objects.values('id').get(
            Q(feed_id=feed_id, url=this_item['link']) |
            Q(feed_id=feed_id, title=this_item['title']))
        return
    except Article.DoesNotExist:
        pass
    except Article.MultipleObjectsReturned:
        return

    published_parsed = datetime.utcnow().replace(tzinfo=utc)
    if 'updated_parsed' not in this_item:
        if 'published_parsed' in this_item and datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(tzinfo=utc) < published_parsed:
            published_parsed = datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(tzinfo=utc)
    elif datetime.utcfromtimestamp(mktime(this_item['updated_parsed'])).replace(tzinfo=utc) < published_parsed:
        published_parsed = datetime.utcfromtimestamp(
            mktime(this_item['updated_parsed'])).replace(tzinfo=utc)

    if 'author' not in this_item:
        this_item['author'] = None
    if 'description' not in this_item:
        this_item['description'] = ''
    if len(this_item['title']) > 200:
        this_item['title'] = this_item['title'][:180] + '...'

    res = process_article(this_item['description'])
    this_item['excerpt'] = res['excerpt']
    this_item['word_count'] = res['word_count']
    this_item['description'] = res['content']
    media = res['image']
    full = None
    if not media:
        if 'media_content' in this_item and 'url' in this_item['media_content'][0]:
            media = this_item['media_content'][0]['url']
        else:
            full = get_article_readability(this_item)
        if full:
            res = full
            media = res['lead_image_url']

    if len(this_item['excerpt']) == 0:
        this_item['language'] = cld.detect(this_item['excerpt'].encode('ascii', 'ignore'))[1]
        if this_item['language'] == 'un':
            this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]
    else:
        this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]

    if kwargs.get('summarize_excerpt'):
        extend = {'content_ex': None, 'summary': None}
        try:
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = tmp.name
                tmp.write((strip_tags(res['content'].decode('ascii', 'ignore'))).strip())
                tmp.flush()
                extend['summary'] = subprocess.check_output(
                    ['ots', tmp_path]).strip().splitlines().pop().strip()
        except Exception:
            pass
    else:
        extend = extend_article(full, this_item['link'], **kwargs)

    obj, created = Article.objects.get_or_create(
        feed_id=feed_id,
        url=this_item['link'],
        defaults={
            'title': this_item['title'],
            'content': this_item['description'],
            'word_count': res['word_count'],
            'url': this_item['link'],
            'media': media,
            'date_parsed': published_parsed,
            'author': this_item['author'],
            'excerpt': this_item['excerpt'],
            'language': this_item['language'],
            'summary': extend['summary'],
            'content_ex': extend['content_ex']
        })
    if created:
        get_article_info(obj)
annotations_table, spotlight_table = sql_convenience.create_all_tables(args.keyword)
tweets = tweet_generators.get_tweets(open(args.tweet_file))

# we can skip through Tweets we've already seen in the same file by
# specifying a tweet id to jump to
if args.skipto is not None:
    for tweet in tweets:
        if tweet['id'] == args.skipto:
            break  # continue after this tweet

for tweet in tweets:
    tweet_text = unicode(tweet['text'])
    annotate = True
    # determine if this is an English tweet or not
    tweet_text_bytesutf8 = tweet_text.encode('utf-8')
    language_name, language_code, is_reliable, text_bytes_found, details = cld.detect(tweet_text_bytesutf8)
    # example: ('SPANISH', 'es', True, 69, [('SPANISH', 'es', 100, 93.45794392523365)])
    print("---")
    print(language_name, language_code, is_reliable)
    if language_code not in set(["en", "un"]):
        annotate = False
    tweet_id = tweet['id']
    if sql_convenience.check_if_tweet_exists(tweet_id, annotations_table) == 0:
        # check our keyword is present as Twitter can provide tweets 'relevant
        # to your keyword' which don't actually contain the keyword (but it
        # might be linked in a t.co title or body text)
        nbr_keywords = tweet_text.lower().count(args.keyword)
        nbr_keywords_hash = tweet_text.lower().count("#" + args.keyword)
        print(nbr_keywords, nbr_keywords_hash)
        if nbr_keywords == nbr_keywords_hash:
def clean(self, bid=None):
    super(Donation, self).clean()
    if self.domain == 'LOCAL':  # local donations are always complete, duh
        if not self.donor:
            raise ValidationError('Local donations must have a donor')
        self.transactionstate = 'COMPLETED'
    if not self.donor and self.transactionstate != 'PENDING':
        raise ValidationError(
            'Donation must have a donor when in a non-pending state')
    if not self.domainId and self.donor and self.timereceived:
        self.domainId = str(calendar.timegm(
            self.timereceived.timetuple())) + self.donor.email
    bids = set(self.bids.all())
    # because non-saved bids will not have an id, they are not hashable, so we have to special case them
    if bid:
        if not bid.id:
            bids = list(bids) + [bid]
        else:
            # N.B. the order here is very important, as we want the new copy of bid to override the old one (if present)
            bids = list(set([bid]) | bids)
    bids = map(lambda b: b.amount, bids)
    bidtotal = reduce(lambda a, b: a + b, bids, Decimal('0'))
    if self.amount and bidtotal > self.amount:
        raise ValidationError(
            'Bid total is greater than donation amount: %s > %s' % (bidtotal, self.amount))
    tickets = self.tickets.all()
    ticketTotal = reduce(lambda a, b: a + b,
                         map(lambda b: b.amount, tickets), Decimal('0'))
    if self.amount and ticketTotal > self.amount:
        raise ValidationError(
            'Prize ticket total is greater than donation amount: %s > %s' % (ticketTotal, self.amount))
    if self.comment and cld:
        if self.commentlanguage == 'un' or self.commentlanguage is None:
            detectedLangName, detectedLangCode, isReliable, textBytesFound, details = cld.detect(
                self.comment.encode('utf-8'), hintLanguageCode='en')
            if detectedLangCode in map(lambda x: x[0], LanguageChoices):
                self.commentlanguage = detectedLangCode
            else:
                self.commentlanguage = 'un'
    else:
        self.commentlanguage = 'un'
from collections import Counter
import cld, tweetstream

if __name__ == "__main__":
    langs = Counter()
    while True:
        try:
            with tweetstream.FilterStream("jcodameta", "T3stcct!", track=["MediaLabIO"]) as stream:
                for tweet in stream:
                    tweet_text = tweet.get("text")
                    try:
                        lang = cld.detect(tweet_text.encode('utf-8'))[0]
                        langs[lang] += 1
                        print langs, tweet_text
                    except:
                        print "error1"
                        continue
        except:
            print "error2"
            continue
import scraperwiki
import cld

text = []
text.append('hello over there, i am from Moscow')
text.append('Привет всем, я алкоголик из Москвы')
text.append('Hola a todos, soy un alcohólico de Moscú')

topLanguageName = []
lngCode = []
for x in range(len(text)):
    topLanguageName.append(cld.detect(text[x])[0])
    lngCode.append(cld.detect(text[x])[1])
    print topLanguageName[x]
import cld

text_it = "Wales lancia la Wikipedia delle news. Contro il fake in campo anche Google"
text_en = "Cassini Spacecraft Re-Establishes Contact After 'Dive' Between Saturn And Its Rings"

lang_it = cld.detect(text_it)
lang_en = cld.detect(text_en)

print(text_it, "is in", lang_it)
print(text_en, "is in", lang_en)
db = Connection(host=HOST_NAME,
                database="jb",
                user=MYSQL_USER_NAME,
                password=MYSQL_PASSWORD)

db2 = Connection(host=HOST_NAME,
                 database="jb_pure",
                 user="******",
                 password="******")

tweets = db.query("SELECT * FROM tweets")
count = 0

# we use a set to save tweets, and check against that to prevent duplicates
saved = set()

for t in tweets:
    c = t["tweet"].encode('utf-8')

    # language detection
    name, code, reliable, bytes_found, details = cld.detect(c)

    # compile a regex for urls. We don't want tweets with urls
    r = re.compile(r"(http://[^ ]+)")
    urlmatch = r.search(c)

    if (code == "en" or code == "un") and not urlmatch and c not in saved:
        # we allow 'unknown' languages into our database, as these are mostly short singlish sentences
        db2.execute("INSERT INTO tweets (user, tweet, location) VALUES (%s, %s, %s)",
                    t["user"], c, t["location"])
        saved.add(c)
    else:
        print "Not English: " + c + " lang:" + name
    count = count + 1

db.close()
db2.close()
def get_full_article(this_item, feed_id, **kwargs):
    if len(this_item['link']) > 200:
        this_item['link'] = short.shorten(this_item['link'])['url']
    if any(required not in this_item for required in ['title', 'link']):
        return
    try:
        Article.objects.values('id').get(
            Q(feed_id=feed_id, url=this_item['link']) |
            Q(feed_id=feed_id, title=this_item['title']))
        return
    except Article.DoesNotExist:
        pass
    except Article.MultipleObjectsReturned:
        return

    published_parsed = datetime.utcnow().replace(tzinfo=utc)
    if 'updated_parsed' not in this_item:
        if 'published_parsed' in this_item and datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(tzinfo=utc) < published_parsed:
            published_parsed = datetime.utcfromtimestamp(
                mktime(this_item['published_parsed'])).replace(tzinfo=utc)
    elif datetime.utcfromtimestamp(mktime(this_item['updated_parsed'])).replace(tzinfo=utc) < published_parsed:
        published_parsed = datetime.utcfromtimestamp(
            mktime(this_item['updated_parsed'])).replace(tzinfo=utc)

    if 'author' not in this_item:
        this_item['author'] = None
    if 'description' not in this_item:
        this_item['description'] = ''
    if len(this_item['title']) > 200:
        this_item['title'] = this_item['title'][:180] + '...'

    res = process_article(this_item['description'])
    this_item['excerpt'] = res['excerpt']
    this_item['word_count'] = res['word_count']
    this_item['description'] = res['content']
    media = res['image']
    full = None
    if not media:
        if 'media_content' in this_item and 'url' in this_item['media_content'][0]:
            media = this_item['media_content'][0]['url']
        else:
            full = get_article_readability(this_item)
        if full:
            res = full
            media = res['lead_image_url']

    if len(this_item['excerpt']) == 0:
        this_item['language'] = cld.detect(this_item['excerpt'].encode('ascii', 'ignore'))[1]
        if this_item['language'] == 'un':
            this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]
    else:
        this_item['language'] = cld.detect(this_item['title'].encode('ascii', 'ignore'))[1]

    if kwargs.get('summarize_excerpt'):
        extend = {'content_ex': None, 'summary': None}
        try:
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = tmp.name
                tmp.write((strip_tags(res['content'].decode('ascii', 'ignore'))).strip())
                tmp.flush()
                extend['summary'] = subprocess.check_output(['ots', tmp_path]).strip().splitlines().pop().strip()
        except Exception:
            pass
    else:
        extend = extend_article(full, this_item['link'], **kwargs)

    obj, created = Article.objects.get_or_create(
        feed_id=feed_id,
        url=this_item['link'],
        defaults={'title': this_item['title'],
                  'content': this_item['description'],
                  'word_count': res['word_count'],
                  'url': this_item['link'],
                  'media': media,
                  'date_parsed': published_parsed,
                  'author': this_item['author'],
                  'excerpt': this_item['excerpt'],
                  'language': this_item['language'],
                  'summary': extend['summary'],
                  'content_ex': extend['content_ex']})
    if created:
        get_article_info(obj)