def parse_nominatim(dbo, jr, j, q):
    # jr is the raw response text, j the decoded JSON body, q the query string
    if len(j) == 0:
        al.debug("no response from nominatim for %s (response %s)" % (q, str(jr)), "geo.parse_nominatim", dbo)
        return None
    try:
        latlon = "%s,%s,%s" % (str(utils.strip_non_ascii(j[0]["lat"])), str(utils.strip_non_ascii(j[0]["lon"])), "na")
        al.debug("contacted nominatim to get geocode for %s = %s" % (q, latlon), "geo.parse_nominatim", dbo)
        return latlon
    except Exception as err:
        al.error("couldn't find geocode in nominatim response: %s, %s" % (str(err), jr), "geo.parse_nominatim", dbo)
        return None
def parse(self):
    h = self.address_hash()
    j = self.json_response
    if len(j) == 0:
        al.debug("no response from nominatim for %s (response %s)" % (self.url, str(self.response)), "geo.parse_nominatim", self.dbo)
        return "0,0,%s" % h
    try:
        latlon = "%s,%s,%s" % (str(utils.strip_non_ascii(j[0]["lat"])), str(utils.strip_non_ascii(j[0]["lon"])), h)
        al.debug("contacted nominatim to get geocode for %s = %s" % (self.url, latlon), "geo.parse_nominatim", self.dbo)
        return latlon
    except Exception as err:
        al.error("couldn't find geocode in nominatim response: %s, %s" % (str(err), self.response), "geo.parse_nominatim", self.dbo)
        return "0,0,%s" % h
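# A minimal sketch of the input these two parsers expect, assuming the Nominatim
# search endpoint has already been called and its JSON body decoded; the response
# text, query string and dbo handle below are illustrative only.
import json

jr = '[{"lat": "51.5074", "lon": "-0.1278"}]'  # raw response text (hypothetical)
j = json.loads(jr)                             # decoded JSON list
# parse_nominatim(dbo, jr, j, "10 Downing Street, London")
# would return "51.5074,-0.1278,na", or None if the lookup failed.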
def generate_tweet_text(mood):
    filename = "emotions/{}.txt".format(mood)
    with open(filename, encoding='utf-8') as f:
        text = f.read()
    text = utils.strip_non_ascii(text)
    text_model = markovify.Text(text)
    sentence = text_model.make_short_sentence(120)  # generate short tweet
    synonymset = dictionary.synonym(mood)
    synonym = choice(synonymset)
    sentence += " #{}".format(synonym)  # generate hashtag
    return sentence.encode('utf-8')
def gks(m, f):
    """ reads field f from map m, returning a string. string is empty if key not present """
    if f not in m:
        return ""
    return str(utils.strip_non_ascii(m[f]))
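# Every snippet here leans on a strip_non_ascii helper that is not shown; a
# minimal sketch of what such a helper typically does (an assumption, not the
# actual implementation from these codebases):
def strip_non_ascii(s):
    """Return s with every character outside the 7-bit ASCII range removed."""
    return "".join(c for c in s if ord(c) < 128)

# strip_non_ascii("Ångström 10μm") == "ngstrm 10m"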
import pickle

import fuzzy
from pymongo import MongoClient

# Python 2 script: strip_non_ascii and the transliterate/DEVANAGARI/HK helpers
# are assumed to be imported from the project's transliteration module.
conn = MongoClient()
db = conn.sentiment_analysis_db

path = '../files/hindi/'
word2Synset = pickle.load(open(path + "WordSynsetDict.pk"))

# dmetaphone = fuzzy.DMetaphone()
soundex = fuzzy.Soundex(4)

# start from a clean collection
print db.hindi_dictionary.drop_indexes()
print db.hindi_dictionary.remove({})

words = []
for word in word2Synset.keys():
    transliterated = strip_non_ascii(transliterate(word, DEVANAGARI, HK))
    synsets = []
    for vv in word2Synset[word].values():
        synsets.extend(vv)
    lower = transliterated.lower()
    sound = soundex(lower.decode('ascii', errors='ignore'))
    words.append({
        'word': word,
        'synsets': synsets,
        'transliteration': lower,
        'sound': sound
    })
    # insert in batches of roughly a thousand documents
    if len(words) > 1000:
        db.hindi_dictionary.insert_many(words)
        words = []

# flush whatever is left of the final partial batch
if words:
    db.hindi_dictionary.insert_many(words)
import fuzzy
import sys

from pymongo import MongoClient

# transliterate/DEVANAGARI/HK, strip_non_ascii and the edit-distance helper
# distance() are assumed to be imported elsewhere in this script.
conn = MongoClient()
db = conn.sentiment_analysis_db

soundex = fuzzy.Soundex(4)

for line in open('../../resources/word-frequency-hindi.txt'):
    line = line.strip()
    word, freq = line.split('\t')
    word = word.decode('utf-8')  # .replace('\0xef', '')
    found = db.hindi_dictionary.find_one({'word': word})
    if not found:
        transliterated = transliterate(word, DEVANAGARI, HK)
        transliterated = strip_non_ascii(transliterated)
        # the field is named 'transliteration' where the dictionary is built above
        found = db.hindi_dictionary.find_one({'transliteration': transliterated})
        if not found:
            # fall back to a phonetic match, then rank candidates by edit distance
            sound = soundex(transliterated)
            sounding_same = list(db.hindi_dictionary.find({'sound': sound}))
            if len(sounding_same) > 0:
                found = sorted([(i['word'], distance(word, i['word']))
                                for i in sounding_same], key=lambda x: x[1])[0][0]
        else:
            found = found['word']
    else:
        found = found['word']
    print word, found
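# The fallback above matches words purely by their Soundex code and then picks
# the candidate with the smallest edit distance. A tiny self-contained sketch
# of the phonetic part, using the same fuzzy package (sample words made up):
import fuzzy

soundex = fuzzy.Soundex(4)
print(soundex('namaste') == soundex('namastay'))  # True: spelling variants share a code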