def add_lang(item):
    """Attach a guessed language code to *item* under the 'lang' key.

    Tries the institution description first, then the job-functions list.
    Returns {} when neither field yields a recognizable language.
    """
    detected = guess_language.guessLanguage(item['institution_description'])
    if detected == 'UNKNOWN':
        detected = guess_language.guessLanguage(item['job_functions_list'])
    if detected == 'UNKNOWN':
        logging.info('No language found for item %s' % item['phid'])
        return {}
    item['lang'] = detected
    return item
def index(page = 1):
    """Home page: show followed posts (paginated) and accept a new post.

    On a valid form submit, stores the post with a guessed language tag and
    redirects (POST/redirect/GET) so a refresh cannot resubmit.
    """
    form = PostForm()
    if form.validate_on_submit():
        language = guessLanguage(form.post.data)
        # BUG FIX: original read `len(lanaguage)` (typo), which raised
        # NameError whenever the guessed code was not 'UNKNOWN'.
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''  # empty string marks an undetermined language
        post = Post(body = form.post.data,
                    timestamp = datetime.utcnow(),
                    author = g.user,
                    language = language)
        db.session.add(post)
        db.session.commit()
        flash(gettext('Your post is now live!'))
        return redirect(url_for('index'))
    # NOTE(review): sibling views use POSTS_PER_PAGE; confirm POST_PER_PAGE
    # is the constant actually defined in this module.
    posts = g.user.followed_posts().paginate(page, POST_PER_PAGE, False)
    return render_template("index.html", title = "Home", form = form, posts = posts)
def post_prayer():
    """Create a new prayer post, optionally restricted to selected groups.

    Renders the post form plus one GroupPost sub-form per group the current
    user belongs to; on a valid submit, stores the Post and, for non-public
    posts, attaches it to every group whose access checkbox was ticked.
    """
    form = PostForm()
    # One (group, sub-form) pair per membership; the prefix keeps each
    # sub-form's field names unique inside the combined HTML form.
    group_forms = [(group, GroupPost(prefix = str(group.id))) for group in g.user.groups]
    if form.validate_on_submit():
        language = guessLanguage(form.post.data)
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''  # empty string marks an undetermined language
        post = Post(subject = form.subject.data, body = form.post.data, timestamp = datetime.utcnow(), author = g.user, language = language, public = form.public.data)
        db.session.add(post)
        db.session.commit()
        if not post.public:
            # Filter only groups that were selected in the sub-forms.
            # (the lambda's `g` shadows flask's `g` inside the lambda only)
            add_groups = filter(lambda g: g[1].group_access.data == True, group_forms)
            for group in add_groups:
                group[0].add_post(post)
                db.session.add(group[0])
            db.session.commit()
        flash(gettext('Your post is now live!'))
        return redirect(url_for('post', id = post.id))
    return render_template('post_form.html', title = 'Post Prayer', form = form, group_forms = group_forms)
def edit_post(id, page = 1):
    """Edit an existing post owned by the current user.

    GET pre-fills the form from the stored post; a valid POST updates
    subject, body, and re-guessed language. Non-owners are redirected away.
    """
    post = Post.query.get(id)
    if post == None:
        flash('Post not found.')
        return redirect(url_for('index'))
    if post.author.id != g.user.id:
        # Only the author may edit.
        flash('You cannot edit this post.')
        return redirect(url_for('index'))
    form = PostForm()
    if form.validate_on_submit():
        language = guessLanguage(form.post.data)
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''  # empty string marks an undetermined language
        post.subject = form.subject.data
        post.body = form.post.data
        post.language = language
        db.session.add(post)
        db.session.commit()
        flash(gettext('Your post has been updated!'))
        return redirect(url_for('index'))
    elif request.method != "POST":
        # Initial GET: seed the form with the current post contents.
        form.subject.data = post.subject
        form.post.data = post.body
    comments = post.comments.order_by(Comment.timestamp.desc()).paginate(page, POSTS_PER_PAGE, False)
    return render_template('edit_post.html', post = post, form = form, comments = comments)
def pack(args):
    """Pack an input file into an NDEF message and write it to args.outfile.

    (Python 2.) text/plain input becomes an NFC Text record with a guessed
    language (falling back to 'en'); anything else becomes a generic record
    carrying the mime type. Output to stdout is hex-encoded.
    """
    if args.type == 'unknown':
        # No explicit type given: infer the mime type from the file name.
        print >> sys.stderr, "guess mime type from file"
        mimetype = mimetypes.guess_type(args.input.name, strict=False)[0]
        if mimetype is not None:
            args.type = mimetype
    if args.name is None:
        # Use the input file name as the record name, except for stdin.
        args.name = args.input.name if args.input.name != "<stdin>" else ""
    data = args.input.read()
    if args.type == "text/plain":
        print >> sys.stderr, "text/plain ==> urn:nfc:wkt:T"
        try:
            # guess-language is optional; default to English without it.
            from guess_language import guessLanguage
            print >> sys.stderr, "guess language from text"
            language = guessLanguage(data)
            if language == "UNKNOWN":
                language = "en"
        except ImportError:
            language = "en"
        print >> sys.stderr, "text language is '%s'" % language
        record = nfc.ndef.TextRecord(data, language=language)
        record.name = args.name
    else:
        print >> sys.stderr, "mime type is %s" % args.type
        record = nfc.ndef.Record(args.type, args.name, data)
    message = nfc.ndef.Message(record)
    if args.outfile.name == "<stdout>":
        # Hex-encode for terminals; raw bytes otherwise.
        args.outfile.write(str(message).encode("hex"))
    else:
        args.outfile.write(str(message))
def index(page=1):
    """Render the home page and handle submission of a new post."""
    form = PostForm()
    if form.validate_on_submit():
        # Tag the post with a guessed language; blank means "unknown".
        language = guessLanguage(form.post.data)
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''
        new_post = Post(body = form.post.data,
                        timestamp = datetime.utcnow(),
                        author=g.user,
                        language = language)
        db.session.add(new_post)
        db.session.commit()
        flash("Your post is now live!")
        # Redirect (POST/redirect/GET) so a browser refresh cannot
        # resubmit the form and create a duplicate post.
        return redirect(url_for('index'))
    # Paginate the followed-users feed instead of loading every post.
    posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
    return render_template('index.html', title="Home", form=form, posts=posts)
def index(page = 1):
    """Show the home timeline and accept new blog posts."""
    form = PostForm()
    if form.validate_on_submit():
        lang = guessLanguage(form.post.data)
        # Blank language code stands in for "could not determine".
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        entry = Post(body = form.post.data,
                     timestamp = datetime.utcnow(),
                     author= g.user,
                     language = lang)
        db.session.add(entry)
        db.session.commit()
        flash('Your post is now live!')
        return redirect(url_for('index'))
    # Paginated feed of posts by followed users.
    posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
    return render_template('index.html', title = 'Home', form = form, posts=posts)
def event_analysis_fulfill_corpus(event_analysis, websites, description_tree_tagger, website_tree_tagger, events):
    """Part 1 of the event analysis: fill the corpus.

    For each event, adds its description (when detected as the analysis
    language) and, when the description is too short, the scraped text of
    its website. TreeTagger results are cached in the two *_tree_tagger
    dicts keyed by event id; scraped text is buffered in *websites*.
    """
    tagger = TreeTagger()
    # We complete the corpus with plain text of description & website if exists
    for e in events:
        len_description = 0
        if e.description != '' and guess_language.guessLanguage(e.description.encode('utf-8')) == LANGUAGE_FOR_TEXT_ANALYSIS:
            event_analysis.add_document_in_corpus(e.description, EventAnalysis.get_id_website(e.id, False))
            description_tree_tagger[e.id] = tagger.tag_text(e.description, FILTER_TREE_TAGGER)
            len_description = len(description_tree_tagger[e.id])
        # NOTE(review): comparing len_description against a function of
        # itself looks odd -- confirm is_nb_word_website_enough's contract.
        if e.website != '' and len_description < is_nb_word_website_enough(len_description):
            try:
                unique_urls = HashTableUrl()
                TreeNode(e.website.encode('utf-8'), DEFAULT_RECURSION_WEBSITE, unique_urls)
                websites[e.website] = ''
                for w in unique_urls.get_urls():
                    websites[e.website] += event_website_parser(w) + ' '
                event_analysis.add_document_in_corpus(websites[e.website], EventAnalysis.get_id_website(e.id, True))
                website_tree_tagger[e.id] = tagger.tag_text(websites[e.website], FILTER_TREE_TAGGER)
                # We empty the buffer, to save memory and because we only need it afterwards the url
                websites[e.website] = ' '
            # Some website :
            # - has a 403 error, eg: complexe3d.com,
            # - is nonexistent website like http://www.biblio.morges.ch
            # - is not a web url ... like [email protected],
            #   thhp://www.vitromusee.ch (the typo is on purpose !), www,chateaudeprangins.ch, http://
            # BUG FIX: the exception was previously bound to `e`, clobbering
            # the loop variable `e` (the current event) for the rest of the
            # iteration; it is now bound to `err`.
            except (HTTPError, URLError, ValueError) as err:
                # We must know the other kind of error as conversion problem
                pass
def filter_english_tweets(df):
    """Return a copy of *df* keeping only tweets that decode cleanly as
    UTF-8 text and whose guessed language is English.

    (Python 2 / pandas.) Resets the index of the returned frame.
    """
    # REMOVE NON LATIN LANGUAGES
    print 'Removing Non Latin Languages'
    temp = []
    for i, tweet in enumerate(df.tweet):
        try:
            # Keep the tweet only when decoding it as UTF-8 yields an equal
            # value -- presumably this passes for plain-ASCII/latin text;
            # TODO confirm the intended semantics.
            if unicode(tweet, 'utf8') == tweet:
                temp.append(True)
            else:
                temp.append(False)
        except ValueError:
            # Undecodable bytes: drop the row.
            temp.append(False)
    # GUESS LANGUAGE
    print 'Guessing Language'
    data = df[temp]
    temp = []
    for x in data['tweet']:
        try:
            temp.append(guessLanguage(x) == 'en')
        except Exception:
            temp.append(False)
    data = data[temp]
    # Re-number rows 0..n-1 after the two filtering passes.
    data.index = range(data.shape[0])
    return data
def language(text):
    """Return the language code that guess-language detects for *text*.

    This call is the program's speed bottleneck and tends to mislabel short
    English comments as other languages, but it has not been observed
    labelling non-English text as English -- so it stays for now.
    """
    return guess_language.guessLanguage(text)
def _calculate_score(trend, entry):
    """Score a feed entry's relevance to *trend*.

    Counts whole-word, case-insensitive occurrences of the trend in the
    entry title and summary; entries whose summary text is not detected
    as English score 0.

    Args:
        trend: the trend to calculate for.
        entry: the feed entry to calculate a score for.
    """
    pattern = re.compile(r'\b%s\b' % trend, re.IGNORECASE)
    title_hits = pattern.findall(entry.get('title', ''))
    summary_hits = pattern.findall(entry.get('summary', ''))
    score = len(title_hits) + len(summary_hits)
    if score == 0:
        return 0
    # Filter out content that is not in English.
    soup = BeautifulSoup(entry.get('summary', ''))
    plain_summary = ''.join(soup.find_all(text=True))
    if guess_language.guessLanguage(plain_summary) != 'en':
        return 0
    return score
def index(page=1):
    """Home view: handle new-post submission and show the followed feed."""
    form = PostForm()
    if not form.validate_on_submit():
        # Plain GET (or invalid submit): just render the paginated feed.
        posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
        return render_template('index.html', title='Home', form=form,
                               posts=posts)
    lang = guessLanguage(form.post.data)
    if lang == 'UNKNOWN' or len(lang) > 5:
        lang = ''  # blank marks an undetermined language
    record = Post(body=form.post.data, timestamp=datetime.utcnow(),
                  author=g.user, language=lang)
    db.session.add(record)
    db.session.commit()
    flash('Your post is now live!')
    # Redirect so a refresh cannot double-post.
    return redirect(url_for('index'))
def user(nickname, page=1):
    """Profile page for *nickname*: list their posts (newest first) and let
    the signed-in user submit a new post.

    (Google App Engine ndb datastore.) Unknown nicknames redirect home.
    """
    form = PostForm()
    if form.validate_on_submit():
        language = guessLanguage(form.post.data)
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''  # blank marks an undetermined language
        post = Post(body=form.post.data, timestamp=datetime.utcnow(), parent=g.user.key, author=g.user.key, language=language)
        post.put()
        flash(gettext('Your post is now live!'))
        return redirect(url_for('user', nickname=nickname))
    key = User.query(User.nickname == nickname)
    user = key.get()
    if user == None:
        flash(gettext('User %(nickname)s not found.', nickname = nickname))
        return redirect(url_for('index'))
    qry = Post.query(ancestor=user.key).order(-Post.timestamp)
    # `callback` is defined elsewhere in this module -- presumably it shapes
    # each post for the template; TODO confirm.
    posts = qry.map(callback)
    return render_template('user.html', user=user, form=form, posts=posts)
def upgrade(request):
    """Run sequential schema migrations (up to version 5) over every
    document in request.db.docs, saving after each step.

    Each pass only touches documents at the previous version, so the
    stages chain: unversioned -> 1 -> 2 -> 3 -> 4 -> 5.
    """
    doc_list = request.db.docs
    # -> v1: stamp version and creation time on unversioned docs.
    for doc in doc_list.find():
        if "version" not in doc:
            doc["version"] = 1
            doc["created"] = datetime.utcnow()
            doc_list.save(doc)
    # v1 -> v2: replace searchable_text with indexed search terms + language.
    for doc in doc_list.find({"version": 1}):
        if "searchable_text" not in doc.keys():
            # Nothing to index; just bump the version.
            doc["version"] = 2
            doc_list.save(doc)
            continue
        searchable_text = doc.pop("searchable_text")
        lang = guessLanguage(searchable_text)
        search_terms = index(searchable_text + " " + doc["title"], [lang])
        doc["search_terms"] = search_terms
        doc["language"] = lang
        doc["version"] = 2
        doc_list.save(doc)
    # v2 -> v3: lowercase all search terms.
    for doc in doc_list.find({"version": 2}):
        doc["version"] = 3
        doc["search_terms"] = [x.lower() for x in doc["search_terms"]]
        doc_list.save(doc)
    # v3 -> v4: add an empty keywords list.
    for doc in doc_list.find({"version": 3}):
        doc["version"] = 4
        doc["keywords"] = []
        doc_list.save(doc)
    # v4 -> v5: mark every document as already scanned.
    for doc in doc_list.find({"version": 4}):
        doc["version"] = 5
        doc["already_scanned"] = True
        doc_list.save(doc)
    return {"success": 1}
def file_compare(infile):
    """Print each line of *infile* alongside its guessed language.

    (Python 2.)
    """
    with open(infile) as fp:
        for line in fp:
            line = line.strip()
            # get_text is defined elsewhere -- presumably it extracts or
            # normalizes the text before guessing; TODO confirm.
            lang = get_text(line)
            lang = guessLanguage(lang)
            print line, lang
def __getattr__(self, name):
    """Lazily compute derived document properties on first access.

    (Python 2.) Called by Python only when normal attribute lookup fails;
    the extra falsy check re-computes cached-but-empty values. Some
    properties are cached in __dict__, others ('raw', 'frags', 'lang',
    'body') are returned directly without caching here.
    """
    # handle and cache calculated properties
    if name not in self.__dict__ or not self.__dict__[name]:
        if name == 'raw':
            return self._getraw()  # cached on fs
        if name == 'text':
            self.__dict__['text'] = self._gettext()  # cached in extfields
        if name == 'tokens':
            self.__dict__['tokens'] = self._gettokens()  # cached in extfields
        if name == 'stems':
            self.__dict__['stems'] = self._getstems()  # cached in extfields
        if name == 'termcnt':
            self.__dict__['termcnt'] = self._getstemcount()
        if name == 'tfidf':
            self.__dict__['tfidf'] = self._gettfidf()
        if name == 'title':
            self.__dict__['title'] = self.docid
        if name == 'frags':
            return self._getfrags()  # not cached at all
        if name == 'lang' and 'lang' not in self.__dict__.keys():
            # Guess from the document text; not stored here.
            return guessLanguage(" ".join(self._gettext()))
        if name == 'body':
            return self._getbody()  # not cached
        if name in self.metafields:
            # Known metadata field with no value yet.
            return ''
    if name in self.__dict__.keys():
        # Either it was just computed above, or it existed but was falsy.
        return self.__dict__[name]
    else:
        raise AttributeError, name
def __init__(self, raw=None, docid=None, oid=None, d=None):
    """Load a document by mongo _id or by docid, or create one from raw text.

    Raises KeyError when no identifying argument yields a document.
    """
    if oid:
        # get by mongo oid
        d = Docs.find_one({"_id": oid})
    elif docid:
        # get by docid
        d = Docs.find_one({"docid": docid})
    if d:
        # load the stored values straight into the instance
        self.__dict__.update(d)
    elif raw:
        # create a new document skeleton
        self.__dict__.update({
            'docid': docid,
            'pippies': [],
            'pippiDocs': [],
            'pippiDocsLen': 0,
            'rawid': None,
        })
        if not 'type' in self.__dict__:
            self.__dict__['type'] = 'raw'
        if not 'metadata' in self.__dict__:
            self.__dict__['metadata'] = {}
        if raw:
            # Store the raw text, guess the language from the derived text,
            # and persist the new document.
            self.raw = raw
            self.lang = guessLanguage(" ".join(self.text))
            self.save()
    else:
        raise KeyError('empty docid')
def isNotEnglish(desc):
    """Return True when *desc* appears to be written in a language other
    than English.

    Checks for characters from non-Latin scripts (and Turkish/Polish
    diacritics) first; otherwise falls back to trigram-based guessing.
    """
    script_patterns = (
        u'[\u0400-\u04FF]',      # Cyrillic characters
        u'[\u3040-\u309F]',      # Japanese characters (hiragana)
        u'[\u30A0-\u30FF]',      # Japanese (katakana)
        u'[\uFF00-\uFF9F]',      # Japanese (full/half-width forms)
        u'[\u4E00-\u9FAF]',      # Japanese (kanji)
        u'[\u4E00-\u9FFF]',      # Chinese characters
        u'[\u3400-\u4DFF]',      # Chinese (extension A)
        u'[\uF900-\uFAFF]',      # Chinese (compatibility ideographs)
        u'[\uAC00-\uD7AF]',      # Korean characters
        u'[\u0600-\u06FF]',      # Arabic characters
        u'[ğüşöçİĞÜŞÖÇ]',        # Turkish characters
        u'[łśźżóń깣ŚŹŻÓŃĘĄ]',    # Polish characters
    )
    for pattern in script_patterns:
        if re.search(pattern, desc):
            return True
    # Use trigrams to detect language
    if not 'en' in lang.guessLanguage(desc):
        return True
    return False
def index(page=1):
    """Home page: accept a new post, then show a serialized Page location."""
    form = PostForm()
    if form.validate_on_submit():
        lang = guessLanguage(form.post.data)
        # Blank code stands in for "could not determine".
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        entry = Post(body=form.post.data, timestamp=datetime.utcnow(),
                     parent=g.user.key, author=g.user.key, language=lang)
        entry.put()
        flash(gettext('Your post is now live!'))
        return redirect(url_for('index'))
    #posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
    page, loc = serialize(Page)
    pages = {
        'name': str(page.name),
        'lat': loc.lat,
        'lon': loc.lon,
    }
    return render_template('index.html', title='Home', form=form, pages=pages)
def get_features(text):
    """Get features (list of strings) from *text*.

    Routes Japanese text -- and Chinese, since guess_language sometimes
    mis-recognises ja as zh -- through the Japanese extractor; everything
    else uses the English extractor.
    """
    detected = guessLanguage(text)
    if detected in ('ja', 'zh'):
        extractor = get_japanese_features
    else:
        extractor = get_english_features
    return extractor(text)
def _desc(url, ie_key, title, info):
    """Build a wikitext description for a media item.

    Falls back to *title* when the info dict has no description; long
    descriptions get wrapped in a {{lang|1=...}} template when the
    language can be guessed.
    """
    raw = info.get('description', '').strip() or title
    desc = escape_wikitext(raw)
    if len(raw) > 100:
        lang = guess_language.guessLanguage(raw)
        if lang != 'UNKNOWN':
            desc = u'{{' + lang + u'|1=' + desc + u'}}'
    return desc
def run(self): while True: blog_url = self.entry_queue.get() try: print "BlogPageScraper working on: " + str(blog_url) blog = urllib2.urlopen('http://'+blog_url).read() soup = BeautifulSoup(blog) text_list = soup.findAll(text=True) end =0 start = 0 for i in range(len(text_list)): if text_list[i].find("Create an Account") != -1: start = i if text_list[i].find("Leave a comment") != -1: end = i break text = ''.join([text for text in text_list[start+1:end]]) lang = guess_language.guessLanguage(text) print "language: " + str(lang) emotion = None for row in soup(): found_mood,emotion = self.recursive_mood_find(row,False,emotion) if emotion != None: print " I have found the emotion: "+ unicode(emotion).encode('utf8') + "\n For the page: " + unicode(blog_url).encode('utf8') break if emotion != None and emotion != " " and emotion != "": self.lock.acquire() try: fname = self.filename +"_"+ lang f = codecs.open(fname,'a+',"utf-8") f2 = codecs.open(fname + "_emotions.txt",'a+',"utf-8") #print unicode(text).encode("utf8") f.write("\n\n") f.write("###BLOG_URL####" + blog_url.decode("utf8") + "#####") f.write("\n") f.write("#!#Emotion#!#"+emotion.decode("utf8")+ "#!#!#!#") f.write("\n") f.write(text) f.close() f2.write("\n\n") f2.write(blog_url.decode("utf8")) f2.write("\n") f2.write(emotion.decode("utf8")) f2.close() except: print "failed to write from: "+ unicode(blog_url).encode("utf-8") finally: self.lock.release() except: "got an error scraping page:" + unicode(blog_url).encode("utf-8")
def create_profile(sender, **kwargs):
    """Signal handler: ensure a UserProfile exists and set full_name using
    the name-order convention of the detected language (surname first for
    Chinese, given name first otherwise)."""
    profile, new = UserProfile.objects.get_or_create(user=kwargs['instance'])
    first = profile.user.first_name
    last = profile.user.last_name
    # Guess the language from the concatenated name.
    if gl.guessLanguage(u'%s%s' % (first, last)) == 'zh':
        profile.full_name = u'%s%s' % (last, first)
    else:
        profile.full_name = u'%s %s' % (first, last)
    profile.save()
def POST(self):
    """Detects the language of an input string.

    :returns: a json with an information about the language of the input
        string or an empty dict
    """
    payload = {}
    text = self.request.POST.get('text')
    if text:
        payload['language'] = guess_language.guessLanguage(text)
    return payload
def handle_tweet_pos_tagged(channel, method, properties, body):
    """Queue-consumer callback: extract candidate place names from an
    English-language tweet's pre-tagged word groups and print the top match.

    *body* is a JSON string (or already-parsed dict) with 'tweet' and
    'groups' keys. Non-ASCII and non-English tweets are dropped.
    """
    try:
        body_json = json.loads(body)
    except:
        # Already a dict (or unparseable); use as-is.
        body_json = body
    tweet = body_json['tweet']
    groups = body_json['groups']
    #return
    #if tweet_json.get('retweeted_status') is not None:
    #    return
    # text = tweet_json['text']
    # text = clean_tweet(text)
    # if 'http' in text:
    #     return
    # if len(text) < 100:
    #     return
    if not is_ascii(tweet):
        return
    result = guess_language.guessLanguage(tweet)
    if result != 'en':
        return
    # Strip stop-words (currently just "singapore") out of each group and
    # drop groups that become empty.
    new_groups = []
    remove_list = ['singapore']
    for group in groups:
        word_list = group.split()
        new_group = ' '.join([i for i in word_list if i.lower() not in remove_list])
        if new_group != '':
            new_groups.append(new_group)
    print(tweet)
    print(groups)
    print(new_groups)
    #tokenizer = Tokenizer(tweet_json['text'])
    #tokens = tokenizer.tokenize_as_tweet()
    #text_snippets = tokenizer.generate_candidate_strings(FILTER_KEYWORDS)
    place_finder = PlaceFinder()
    top_places = place_finder.match_text(new_groups, 1, 3)
    for place in top_places:
        print('TOP PLACE', place_finder.get_place(place[0])['name'], place[1])
    #urls = tokenizer.get_urls()
    #for url in urls:
    #    print(http_util.unshorten_url(url))
    print('')
def on_status(self, status):
    """Stream-listener callback: print Japanese tweets with their timestamp
    shifted by +9 hours (UTC -> JST).

    (Python 2.) Errors are reported to stderr and swallowed so the stream
    keeps running.
    """
    try:
        text = status.text
        if guess_language.guessLanguage(text) == 'ja':
            # Shift the (UTC) creation time to Japan Standard Time.
            status.created_at += timedelta(hours=9)
            print "-------------------"
            print "tweeted: " + str(status.created_at)
            print text + "\n"
            # col.insert({str(status.created_at): text})
    except Exception, e:
        print >> sys.stderr, 'Encountered ::', e
        pass
def grep(self, request, response):
    """
    Get the page indicated by the fuzzable_request and determine the
    language using the preposition list.

    Runs under a plugin-wide lock; keeps trying (up to self._tries_left
    responses) until a language is identified, then disables itself.

    :param request: The HTTP request object.
    :param response: The HTTP response object
    """
    with self._plugin_lock:
        if not self._exec:
            # Already finished (language found or attempts exhausted).
            return
        if not response.is_text_or_html():
            return
        if is_404(response):
            return
        body = response.get_clear_text_body().lower()
        try:
            guessed_lang = guess_language.guessLanguage(body)
        except IndexError:
            # I don't care about exception handling of the external lib
            guessed_lang = 'UNKNOWN'
        if guessed_lang == 'UNKNOWN':
            # None means "I'm still trying"
            kb.kb.raw_write(self, 'lang', None)
            # Keep running until self._tries_left is zero
            self._tries_left -= 1
            if self._tries_left == 0:
                msg = ('Could not determine the site language using the'
                       ' first 25 HTTP responses, not enough text to make'
                       ' a good analysis.')
                om.out.debug(msg)
                # unknown means I'll stop testing because I don't
                # have any idea about the target's language
                kb.kb.raw_write(self, 'lang', 'unknown')
                self._exec = False
        else:
            # Only run until we find the page language
            self._exec = False
            msg = 'The page is written in: "%s".'
            om.out.information(msg % guessed_lang)
            kb.kb.raw_write(self, 'lang', guessed_lang)
def recognize(filedata, accepted_languages, force_detection):
    """OCR an image, retrying at 90-degree rotations until the detected
    language is one of *accepted_languages*.

    Returns (language, PIL image, recognized text). When no accepted
    language is found and *force_detection* is true, raises TypeError
    listing the languages that were detected instead.
    """
    with NamedTemporaryFile() as infile:
        infile.write(filedata)
        infile.file.flush()
        with NamedTemporaryFile() as textfile:
            # Initial OCR pass on the unrotated image.
            retval = ocr(infile.name, textfile.name)
            img = imgopen(infile.name)
            if retval:
                # Non-zero OCR exit status: no usable text yet.
                detected_languages = []
                lang = "UNKNOWN"
            else:
                lang = guessLanguage(textfile.read().decode('utf-8'))
                detected_languages = [lang]
            final_filename = infile.name + '-rotated'
            try:
                # Rotations are cumulative (180, then 270, then 90 degrees
                # total); the check at the top of each iteration also covers
                # the result of the previous OCR pass, starting with the
                # unrotated one.
                for rotation in (180, 90, 180, 0):
                    if lang in accepted_languages:
                        textfile.seek(0)
                        return lang, img, textfile.read().decode('utf-8')
                    img = img.rotate(rotation)
                    img.save(final_filename, "JPEG")
                    retval = ocr(final_filename, textfile.name)
                    if retval:
                        # OCR failed at this orientation; try the next.
                        continue
                    textfile.seek(0)
                    lang = guessLanguage(textfile.read().decode('utf-8'))
                    detected_languages.append(lang)
            finally:
                # Best-effort cleanup of the temporary rotated image.
                try:
                    remove(final_filename)
                except OSError:
                    pass
            if force_detection:
                raise TypeError("Languages %s not in range of accepted "
                                "languages %s" % (str(detected_languages),
                                                  str(accepted_languages)))
            return lang, img, textfile.read().decode('utf-8')
def guess_language(text):  # pragma: no cover
    """Guess the language in which a body of text is written.

    This uses the external guess-language python module, and will fail and
    return Language(Undetermined) if it is not installed.
    """
    try:
        from guess_language import guessLanguage
    except ImportError:
        log.error('Cannot detect the language of the given text body, missing dependency: guess-language')
        log.error('Please install it from PyPI, by doing eg: pip install guess-language')
        return UNDETERMINED
    return babelfish.Language.fromguessit(guessLanguage(text))
def index(page=1):
    """Home view: post creation plus the paginated feed of followed posts."""
    user = g.user
    form = PostForm()
    if form.validate_on_submit():
        lang = guessLanguage(form.post.data)
        # Blank language code stands in for "could not determine".
        if lang == "UNKNOWN" or len(lang) > 5:
            lang = ""
        entry = Post(body=form.post.data, timestamp=datetime.utcnow(),
                     author=g.user, language=lang)
        db.session.add(entry)
        db.session.commit()
        flash(gettext("Your post is now live!"))
        return redirect(url_for("index"))
    posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
    # NOTE(review): `title` is assigned but never passed to the template --
    # confirm whether render_template should receive it.
    title = "All the news unfit to print"
    return render_template("index.html", user=user, posts=posts, form=form)
words.append(word) for word in words: if word.istitle() == True: title = True word = word.lower() else: title = False splitword = re.findall(r"[\w']+|[.,!?;—]", word) word = splitword[0] if len(splitword) >= 2: p = True punc = splitword[1] else: p = False if guessLanguage(word) == language: # Worst way to find a match ever... translation = conn.cursor().execute( 'select ' + tl + ' from \'' + tablename + '\' where ' + language + ' = ? limit 1', (word, )).fetchone() if translation == None: translation = conn.cursor().execute( 'select ' + tl + ' from \'' + tablename + '\' where ' + language + ' like ? limit 1', ('%' + word + '%', )).fetchone() if translation == None: translation = conn.cursor().execute( 'select ' + tl + ' from \'' + tablename + '\' where ' + language + ' like ? limit 1', ('%' + word[:-2] + '%', )).fetchone() if translation == None:
def site_index(request):
    """Preliminary site index page: collect free text or a Zotero username
    and derive search terms for the discovery workflow.

    GET shows an empty form; a valid POST either redirects into the Zotero
    OAuth flow or guesses the text's language, extracts keywords, stores
    them in the session, and redirects to the discoveries view.
    """
    # preliminary site index page
    # TODO, possibly -- might be worth supporting HEAD requests
    # since this is the site index
    if request.method == 'GET':
        # on get request, initialize an empty form for display
        form = InputForm()
    elif request.method == 'POST':
        # on post, init form based on posted data
        # if form is invalid, redisplay input form with error messages
        form = InputForm(request.POST)
        if form.is_valid():
            # actual logic here - infer search terms, query apis, display stuff
            text = form.cleaned_data['text']
            zotero_user = form.cleaned_data['zotero_user']
            search_terms = {}
            if zotero_user:
                request.session['username'] = zotero_user
                return HttpResponseRedirect(
                    zotero.oauth_authorize_url(request))
            elif text:
                lang = guess_language.guessLanguage(text)
                logger.debug('language detected as %s' % lang)
                common_terms = common_words(text, 15, lang)
                dbpedia_terms = get_search_terms(text, lang)
                # too many terms? phrase? didn't get results when combining
                # TODO: combine dbpedia + common terms; randomize from dbpedia results
                # search_terms['keywords'].extend(dbpedia_terms['keywords'])
                search_terms['keywords'] = list(dbpedia_terms['keywords'])[:10]
                # if no terms found in dbpedia, use common terms instead
                # (todo: should be some kind of combination)
                if not search_terms['keywords']:
                    search_terms['keywords'] = common_terms['keywords']
                # within dbpedia_terms there are now lists for people,
                # places, and dates {'early': , 'late': }; people and places
                # were reconciled against DBpedia. Dates contains only
                # four-digit values and could be passed on.
            # form is valid, for either text input or zotero where we got
            # terms; store search terms in the session so we can redirect
            request.session['search_terms'] = search_terms
            # insert logic for processing zotero username here
            # zotero_user = form.cleaned_data['zotero_user']
            # redirect
            # NOTE: should probably be http code 303, see other
            return HttpResponseRedirect(reverse('discoveries:view'))
        # if not valid: pass through and redisplay errors
    # NOTE(review): a request that is neither GET nor POST leaves `form`
    # unbound and would raise NameError here -- confirm upstream routing
    # only allows GET/POST.
    return render(request, 'core/site_index.html', {'input_form': form})
doc_id = sys.argv[1] docs = Document.objects.filter(doc_id=doc_id) if len(docs) == 0: print "Document %s not found" % doc_id sys.exit(1) doc = docs[0] asset_path = os.path.join(ORIGINAL_MEDIA_PATH, "%s" % doc.docfile) txt_path = get_txt_path(doc_id) type_of_file = get_type_of_file(asset_path) error_string = '' if type_of_file: converted = convert_file_to_txt(asset_path, txt_path, type_of_file) language = guess_language.guessLanguage(file(txt_path).read()) if not converted: error_string = ERROR_CONVERSION_ERROR if converted: generate_word_list(txt_path, language) else: converted = False error_string = ERROR_UNKNOWN_TYPE_OF_FILE doc.converted = converted doc.format_type = type_of_file doc.language = language doc.size = os.stat(asset_path).st_size doc.txtfile = txt_path print 'language =', doc.language
def test_guess(self):
    """Exercise guessLanguage across a broad sample of languages, then the
    tag/name/id/info helper variants on a French sentence."""
    tests = [
        ("This is a test of the language checker", "en"),
        ("Verifions que le détecteur de langues marche", "fr"),
        ("Sprawdźmy, czy odgadywacz języków pracuje", "pl"),
        ("авай проверить узнает ли наш угадатель русски язык", "ru"),
        ("La respuesta de los acreedores a la oferta argentina para salir del default no ha sido muy positiv", "es"),
        ("Сайлау нәтижесінде дауыстардың басым бөлігін ел премьер министрі Виктор Янукович пен оның қарсыласы, оппозиция жетекшісі Виктор Ющенко алды.", "kk"),  # Kazakh
        ("милиция ва уч солиқ идораси ходимлари яраланган. Шаҳарда хавфсизлик чоралари кучайтирилган.", "uz"),  # uzbek
        ("көрбөгөндөй элдик толкундоо болуп, Кокон шаарынын көчөлөрүндө бир нече миң киши нааразылык билдирди.", "ky"),  # kyrgyz
        ("yakın tarihin en çekişmeli başkanlık seçiminde oy verme işlemi sürerken, katılımda rekor bekleniyor.", "tr"),
        ("Daxil olan xəbərlərdə deyilir ki, 6 nəfər Bağdadın mərkəzində yerləşən Təhsil Nazirliyinin binası yaxınlığında baş vermiş partlayış zamanı həlak olub.", "az"),  # Azerbaijani
        (" ملايين الناخبين الأمريكيين يدلون بأصواتهم وسط إقبال قياسي على انتخابات هي الأشد تنافسا منذ عقود", "ar"),
        ("Американське суспільство, поділене суперечностями, збирається взяти активну участь у голосуванні", "uk"),  # ukrainian
        ("Francouzský ministr financí zmírnil výhrady vůči nízkým firemním daním v nových členských státech EU", "cs"),  # czech
        ("biće prilično izjednačena, sugerišu najnovije ankete. Oba kandidata tvrde da su sposobni da dobiju rat protiv terorizma", "hr"),  # croatian
        (" е готов да даде гаранции, че няма да прави ядрено оръжие, ако му се разреши мирна атомна програма", "bg"),  # bulgarian
        ("на јавното мислење покажуваат дека трката е толку тесна, што се очекува двајцата соперници да ја прекршат традицијата и да се појават и на самиот изборен ден.", "mk"),  # macedonian
        ("în acest sens aparţinînd Adunării Generale a organizaţiei, în ciuda faptului că mai multe dintre solicitările organizaţiei privind organizarea scrutinului nu au fost soluţionate", "ro"),  # romanian
        ("kaluan ditën e fundit të fushatës në shtetet kryesore për të siguruar sa më shumë votues.", "sq"),  # albanian
        ("αναμένεται να σπάσουν παράδοση δεκαετιών και να συνεχίσουν την εκστρατεία τους ακόμη και τη μέρα των εκλογών", "el"),  # greek
        (" 美国各州选民今天开始正式投票。据信,", "zh"),  # chinese
        (" Die kritiek was volgens hem bitter hard nodig, omdat Nederland binnen een paar jaar in een soort Belfast zou dreigen te veranderen", "nl"),  # dutch
        ("På denne side bringer vi billeder fra de mange forskellige forberedelser til arrangementet, efterhånden som vi får dem ", "da"),  # danish
        ("Vi säger att Frälsningen är en gåva till alla, fritt och för intet. Men som vi nämnt så finns det två villkor som måste", "sv"),  # swedish
        ("Nominasjonskomiteen i Akershus KrF har skviset ut Einar Holstad fra stortingslisten. Ytre Enebakk-mannen har plass p Stortinget s lenge Valgerd Svarstad Haugland sitter i", "nb"),  # norwegian
        ("on julkishallinnon verkkopalveluiden yhteinen osoite. Kansalaisten arkielämää helpottavaa tietoa on koottu eri aihealueisiin", "fi"),  # finnish
        ("Ennetamaks reisil ebameeldivaid vahejuhtumeid vii end kurssi reisidokumentide ja viisade reeglitega ning muu praktilise informatsiooniga", "et"),  # estonian
        ("Hiába jön létre az önkéntes magyar haderő, hiába nem lesz többé bevonulás, változatlanul fennmarad a hadkötelezettség intézménye", "hu"),  # hungarian
        ("հարաբերական", "hy"),  # armenian
        ("Hai vấn đề khó chịu với màn hình thường gặp nhất khi bạn dùng laptop là vết trầy xước và điểm chết. Sau đây là vài cách xử lý chú", "vi"),
        ("ii", UNKNOWN),  # too short to classify
        # This text has a mix of Hirigana, Katakana and CJK which requires
        # the fix for issue:3 to classify correctly
        ("トヨタ自動車、フィリピンの植林活動で第三者認証取得 トヨタ自動車(株)(以下、トヨタ)は、2007年9月よりフィリピンのルソン島北部に位置するカガヤン州ペニャブラン", 'ja'),
    ]
    for text, name in tests:
        self.assertEquals(name, guessLanguage(text))
    # The helper variants should agree on a known-French sentence.
    text = "Verifions que le détecteur de langues marche"
    self.assertEquals('fr', guessLanguageTag(text))
    self.assertEquals('French', guessLanguageName(text))
    self.assertEquals(26150, guessLanguageId(text))
    self.assertEquals(('fr', 26150, 'French'), guessLanguageInfo(text))
def fetch_usr_tips(user_id):
    """Fetch a Foursquare user's tips (up to 5000) and summarize them.

    Returns a dict with:
      'tips content'   - list of per-tip dicts (len, text, venue, polarity, ...)
      'count'          - total tip count reported by the API (success only)
      'error_meta'     - API meta code as a string (API-error case only)
      'user existence' - '-1' when the API says the user id is invalid
    Returns -2 when the HTTP fetch keeps raising AUTO_RECONNECT_TIMES times.
    """
    # Token and URL are request-invariant: build them once, outside the
    # retry loop (the original rebuilt them on every attempt).
    super_token = 'QEJ4AQPTMMNB413HGNZ5YDMJSHTOHZHMLZCAQCCLXIX41OMP'
    fetch_url_str = ('https://api.foursquare.com/v2/users/' + str(user_id) +
                     '/tips?oauth_token=' + super_token +
                     '&limit=5000&v=20141231')
    retry = 0
    content = ''
    while True:
        try:
            content = get_raw_info(fetch_url_str)
            # get_raw_info signals failure with -1 / -2; keep retrying then
            # (as in the original: no sleep, no retry count in that path).
            if content != -1 and content != -2:
                break
        except Exception:  # FIX: narrowed from bare except (keeps Ctrl-C working)
            time.sleep(3)
            retry += 1
            if retry == AUTO_RECONNECT_TIMES:
                return -2

    # 'tips content' is present even in the error case, matching the original.
    output_dict = {'tips content': []}
    content_json = json.loads(content)
    if content_json['meta']['code'] != 200:
        output_dict['error_meta'] = str(content_json['meta']['code'])
        if str(content_json['meta']['errorDetail']) == "Must provide a valid user ID or 'self.'":
            output_dict['user existence'] = '-1'
        return output_dict

    output_dict['count'] = content_json['response']['tips']['count']
    for item in content_json['response']['tips']['items']:
        if 'cc' in item['venue']['location']:
            venue_country = item['venue']['location']['cc']
        else:
            venue_country = '-'
        a = {}
        a['len'] = len(item['text'])
        a['text'] = item['text'].encode('utf-8')
        a['venue name'] = item['venue']['name'].encode('utf-8')
        # NOTE: 'timespam' (sic) is kept as-is — downstream consumers read
        # this key; renaming it would change the output schema.
        a['timespam'] = str(item['createdAt'])
        a['venue country'] = venue_country
        a['photo'] = "y " if 'photo' in item else "n "
        cate_info = item['venue']['categories']
        if len(cate_info) > 0:
            # As in the original, the LAST category in the list wins.
            for xx in cate_info:
                a['category'] = get_venue_category(xx['name'])
        else:
            a['category'] = '-'
        # Sentiment polarity is only computed for English tips.
        tip_text = a['text']
        tip_language = guess_language.guessLanguage(tip_text)
        if tip_language == 'en':
            testimonial = TextBlob(tip_text)
            a['polarity'] = testimonial.sentiment.polarity
        else:
            a['polarity'] = '-'
        output_dict['tips content'].append(a)
    return output_dict
#!/usr/bin/env python
# Takes an input and prints only the language specified
# Usage: python print-lang.py [filename] [lang-code]
# i.e., python print-lang.py 04.md ru or python print-lang.py 04.md en
import sys
import enchant
from guess_language import guessLanguage

input_file = sys.argv[1]
lang = sys.argv[2]
# FIX: the original rebound `output_file` from the path string to the open
# file object; keep the path in its own variable.
output_path = (input_file.rsplit(".", 1)[0]) + "_" + lang + ".md"

# FIX: use context managers so both files are always closed, and do not
# append an extra "\n" — lines read from a file already end with one, so
# the original emitted a blank line after every matching line.
with open(input_file, 'r') as myfile, open(output_path, 'w') as output_file:
    for line in myfile:
        if guessLanguage(line) == lang:
            output_file.write(line if line.endswith("\n") else line + "\n")
def main(argv=None): """ Main function """ if argv is None: argv = sys.argv points = [] labels = [] ru = [] en = [] uk = [] pl = [] ru_labels = [] en_labels = [] uk_labels = [] pl_labels = [] points_labels = [] title_labels = dict() lang_labels = dict() type_labels = dict() type_file = open('../youTubeData/video_type', "r") type_line = type_file.readline() while not type_line == "": type_labels[type_line.split(";")[0]] = type_line.split(";")[1].strip() type_line = type_file.readline() lang_file = open('../youTubeData/manually_recognized', "r") lang_line = lang_file.readline() while not lang_line == "": lang_labels[lang_line.split(";")[0]] = lang_line.split(";")[1].strip() lang_line = lang_file.readline() title_file = open("../youTubeData/all_frames_stats_title", "r") line = " " manual = 0 while not line == "": line = title_file.readline() lbl = line.strip() # .lower().replace("stepan bandera","en") title = title_file.readline() title_labels[lbl] = title title_lang = guess_language.guessLanguage(strip_tags(title)) desc = title_file.readline() try: desc.split("No description available")[1] desc = "" except IndexError: pass desc_lang = guess_language.guessLanguage(strip_tags(desc)) line = title_file.readline() lang = guess_language.guessLanguage( strip_tags(title) + strip_tags(desc)) if lbl in lang_labels: print lbl, " found" continue print lbl, " not found" if lang in ['uk', 'ru', 'pl', 'en']: lang_labels[lbl] = lang else: manual += 1 print "------------------------------------------------------------" print title_lang, desc_lang print title print desc print "------------------------------------------------------------" l = raw_input("which language? 
") lang_labels[lbl] = l print manual, " manually recognized" lang_file.close() lang_file_content = "" print lang_labels for key in lang_labels.keys(): lang_file_content += key + ";" + lang_labels[key] + "\r\n" print lang_file_content lang_file = open('../youTubeData/manually_recognized', "w") lang_file.write(lang_file_content) lang_file.close() text_file = open('../youTubeData/all_frames_stats', "r") line = " " counter = 0 while not line == "": line = text_file.readline() counter = counter + 1 try: values = map(float, line.split('\r\n')[0].split(';')[1:]) label = line.split(';')[0] labels.append(label) size, min_max, mean, variance, skew, kurt = scipy.stats.describe( values) # throw out vids under 20 seconds if size < 20: print "video too short: ", label continue # cut min variance at 0.001, otherwise the plot gets quite # distorted if variance < 0.001: variance = 0.001 except ValueError: print "error calculating stats for ", label continue point = [mean, variance] try: if lang_labels[label] == "uk": uk.append(point) uk_labels.append(label) elif lang_labels[label] == "ru": ru.append(point) ru_labels.append(label) elif lang_labels[label] == "en": en.append(point) en_labels.append(label) elif lang_labels[label] == "pl": pl.append(point) pl_labels.append(label) else: points.append(point) points_labels.append(label) except: points.append(point) points_labels.append(label) print counter, " lines read." 
print "number of labels ", str(len(labels)) print "number of labels ", str(len(lang)) pylab.show() pylab.xlabel('Mean') pylab.ylabel('Variance') # pylab.yscale("log") pylab.title("Frame Likenesses of Bandera Youtube Clips") pylab.plot(*zip(*points), marker='o', color='w', ls='') pylab.plot(*zip(*uk), marker='o', color='#ff8000', ls='') pylab.plot(*zip(*ru), marker='o', color='#b40404', ls='') pylab.plot(*zip(*pl), marker='o', color='#66FF00', ls='') pylab.plot(*zip(*en), marker='o', color='#819FF7', ls='') figure = pylab.gcf() figure.set_size_inches(figure.get_size_inches()[0] * 2, figure.get_size_inches()[1] * 2) figure.savefig('video_langs.png', bbox_inches='tight')
def detect_language(content):
    """Return the language code guessed for *content*."""
    # Deferred import: loading guess_language takes noticeable time, so it
    # is paid only when detection is actually requested.
    import guess_language
    detected = guess_language.guessLanguage(content)
    return detected
def tuling_reply(msg):
    """Build an auto-reply for an incoming WeChat message.

    Dispatches on the guessed language of the message: English senders get
    the English greeting/chat flow, everyone else the Chinese one. Senders
    opt in with "start"/<开始> and out with "stop"/<关闭>. Opted-in senders
    live in the module-level `replylist`; senders who have already been
    greeted live in `open_reminder` (both persisted via write_replyDB /
    write_reminderDB). Returns the reply text, or None when nothing should
    be sent.

    If the Tuling API key is broken, get_response() may return None, so a
    default reply is used as fallback.
    """
    lang = guessLanguage(
        msg["Text"])  # guess the language type and give a different response
    # Unique id of the sender — FromUserName etc. change on every login and
    # are NOT stable identifiers.
    sender_alias = msg['User']['Alias'].replace("_", "")
    sender_city = msg['User']['Province'] + u'省' + msg['User']['City'] + u'市'
    sender_nickName = msg['User']['NickName']
    if lang == 'en':  # the message looks English
        first_greetings_EN = u"Hello, I'm xiaobo belonging to Zhenbo Xu, may I help you? If it's a emergency, please contact my owner by phone call or SMS. I can chat with you if you want, send <start> to begin and send <stop> to shut me down."
        defaultReply_EN = u'OMG, this question stumped me. Could you please wait for my master?'
        if msg["Text"] == "start":  # the sender starts the service
            if sender_alias not in replylist:
                replylist.append(
                    sender_alias)  # remember who opted in
                write_replyDB()  # persist to the file
            if sender_alias not in open_reminder:
                open_reminder.append(sender_alias)
                write_reminderDB()
            return u'xiaobo auto reply started'  # notification sent back to the sender
        elif msg["Text"] == "stop":  # stop the service
            try:
                replylist.remove(sender_alias)
            except ValueError:  # FIX: narrowed from bare except — only "not in list" is expected
                pass
            write_replyDB()
            return u'xiaobo auto reply stopped'
        if sender_alias in replylist:
            # Opted-in sender: feature commands first, then the chat bot.
            special_reply = special_function(msg["Text"])
            if special_reply:
                return special_reply
            return get_response(
                msg['Text']) or defaultReply_EN  # .decode('unicode-escape')
        else:
            # Greet senders who have never been told that xiaobo exists.
            if sender_alias not in open_reminder:
                return first_greetings_EN
            else:
                pass
    else:  # default to Chinese
        first_greetings_CN = sender_nickName + u'你好啊。嘿嘿,我是徐振博家的小机器人小博,有什么可以帮助您的么.有急事请通过短信或者电话联系主人。我可以和您聊天,发送<开始>两个字就可以啦,发送<关闭>可以把烦人的我关掉。现在我会自动谷歌翻译,查天气,歇后语,查邮编查公交>等等。另外,如果有个功能您特别想要,可以联系我主人实现一下'
        defaultReply_CN = u'那个,这个机器人不会回答这个问题,不过他会谷歌翻译(回复<谷歌>查看详细信息),输地名查天气,查单词,歇后语,查邮编查公交等等。另外,如果有个功能您特别想要,可以联系我,我有空实现一下'
        if msg["Text"] == u'开始':
            if sender_alias not in replylist:
                replylist.append(sender_alias)
                write_replyDB()
            if sender_alias not in open_reminder:
                open_reminder.append(sender_alias)
                write_reminderDB()
            return u'小博自动回复已开启'
        elif msg["Text"] == u'关闭':
            try:
                replylist.remove(sender_alias)
            except ValueError:  # FIX: narrowed from bare except
                pass
            write_replyDB()
            return u'小博自动回复已关闭'
        if sender_alias in replylist:
            annoy_to_close = u' '  # hint that xiaobo can be switched off
            if len(
                    msg["Text"]
            ) < 10:  # short message: check for annoyance and advertise the off switch
                if u'关' in msg["Text"] or u'烦' in msg["Text"]:
                    annoy_to_close = u'\n如果觉得我很烦,可以发送<关闭>二字关掉我哦-.- \n'
            special_reply = special_function(msg["Text"])
            if special_reply:
                return special_reply
            tt = get_response(msg['Text'], sender_alias, sender_city)
            # FIX: the original `tt + annoy_to_close or defaultReply_CN +
            # annoy_to_close` never fell back — `+` binds tighter than `or`,
            # so it raised TypeError when tt was None, and the concatenation
            # (annoy_to_close is at least u' ') was always truthy.
            return (tt or defaultReply_CN) + annoy_to_close  # .decode('unicode-escape')
        else:
            if sender_alias not in open_reminder:
                open_reminder.append(sender_alias)
                write_reminderDB()
                return first_greetings_CN
            elif u'小博' in msg["Text"]:
                return u'miss me? 你知道要发送"开始"两个字让我主人把我打开,对吧?'
            else:
                pass
    # NOTE(review): this is the body of a crawl loop — the enclosing
    # for/while header lives outside this chunk, and every `continue`
    # below targets that loop.
    try:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        extracted_text = extractor.getText()
        # skip text if size is less than threshold
        if len(extracted_text) < 500:
            print(
                "\n***SIZE IS TOO SMALL, TEXT IS EXCLUDED!!! (in main.py, extractor)\n"
            )
            # sleep for 4 seconds before crawling again, otherwise the
            # crawler gets identified and blocked
            time.sleep(4)
            continue
        # skip text if the language is not the same as requested
        # (args.lang is the CLI-requested language code)
        lang_tmp = guessLanguage(extracted_text).encode('utf-8')
        if lang_tmp != args.lang:
            print(
                "\n***WRONG LANGUAGE!!! (in main.py, guessLanguage)"
            )
            # sleep for 4 seconds before crawling again, otherwise the
            # crawler gets identified and blocked
            time.sleep(4)
            continue
    except:
        # if 'request timeout' (or any other error) happens, log it and go
        # to the next URL; bare except is deliberate best-effort here
        e = sys.exc_info()[0]
        print("\n***ERROR (in main.py, extractor 2): " + str(e))
        # sleep for 4 seconds before crawling again, otherwise the
        # crawler gets identified and blocked
        time.sleep(4)
        continue
    #article_id += 1
#!/usr/bin/env python
from guess_language import guessLanguage

# Smoke test: the text 'hi' is expected to be detected as English.
verdict = "PASS" if str(guessLanguage('hi')) == 'en' else "FAIL"
print(verdict)