def parse_file(self, path: str) -> "list[BibItem]":
    """
    Parses the file at the given path and tags every item with its origin.

    The file is read through utils.read_utf8_file (which, per the original
    documentation of this method, is what takes care of a utf-8 BOM) and the
    text is handed to self.parse_string. The "source" parameter of every
    returned item is then rewritten to
    "<file basename>:<value of item's "source", zero-padded to 4 digits>".

    @param path: path to an existing regular file
    @returns list of parsed BibItem
    @raises Exception: if path is not a file, or if parsing / tagging fails;
        in the latter case the original error is attached as __cause__
    """
    if not os.path.isfile(path):
        raise Exception("Path to file expected")

    data = utils.read_utf8_file(path)
    try:
        source_file = os.path.basename(path)
        items = self.parse_string(data)
        for item in items:
            # NOTE(review): presumably parse_string stores the integer line
            # number under "source" - confirm. It is replaced here by the
            # "file:line" label.
            self.set_item_param(
                item,
                "source",
                "{source}:{line:04d}".format(
                    source=source_file, line=item.get("source")
                ),
            )
        return items
    except Exception as ex:
        # Chain explicitly so the traceback reports the underlying failure as
        # the direct cause of this wrapper instead of an incidental error
        # raised while handling it.
        raise Exception("While parsing {0}: {1}".format(path, ex)) from ex
def _parse_file(self, path):
    """
    Reads the file at the given path and parses it into items.

    The content is obtained through utils.read_utf8_file (which, per the
    original documentation of this method, is what takes care of a utf-8 BOM)
    and passed to self._parse_string. Every returned item gets two
    parameters set via self.set_item_param:
      * "source_file" - the basename of path
      * "source"      - "<basename>:<item's "source_line", zero-padded to 4>"

    @returns list of parsed BibItem
    @raises Exception: if path is not a file, or (with a "While parsing ..."
        message) if parsing or tagging fails
    """
    is_regular_file = os.path.isfile(path)
    if not is_regular_file:
        raise Exception("Path to file expected")

    contents = utils.read_utf8_file(path)
    try:
        basename = os.path.basename(path)
        parsed = self._parse_string(contents)
        for entry in parsed:
            self.set_item_param(entry, "source_file", basename)
            # Built only after "source_file" has been stored, mirroring the
            # order in which the item is read and written.
            origin = "{source_file}:{source_line:04d}".format(
                source_file=basename,
                source_line=entry.get("source_line")
            )
            self.set_item_param(entry, "source", origin)
    except Exception as error:
        raise Exception("While parsing {0}: {1}".format(path, error))
    return parsed
return [] ret = {} for term, freq in query_tfs.iteritems(): nt = num_docs_with_word.get(term, 0) if nt == 0: continue if scheme not in {2, 3}: ret[term] = (1 + (1.0 * freq / max_term_freq)) * math.log(N / nt) elif scheme == 2: ret[term] = 1 + (1.0 * freq / max_term_freq) else: ret[term] = (1 + math.log(freq)) * math.log(N / nt) return ret data = utils.read_utf8_file("/Users/stps/code/ytp_webapps/src/files/emojipedia_and_unicode.json") emoji_data = json.loads(data) word_to_freq = defaultdict(int) num_docs_with_word = defaultdict(int) about_wt = 1 alias_wt = 3 name_wt = 5 word_to_docs = defaultdict(list) N = len(emoji_data) for ed in emoji_data: about = utils.safe_dct_get(ed, ['emojipedia', 'about'], default_val='') aliases = utils.safe_dct_get(ed, ['emojipedia', 'aliases'], default_val=[]) name = utils.safe_dct_get(ed, ['emojipedia', 'name', 'emoji_name'], default_val='') tfs = defaultdict(int) update_tf_and_idf(string=about, tf=tfs, word_to_freq=word_to_freq, wt=about_wt)
import json from collections import defaultdict as ddict import db import requests from initlogging import getlog import utils from bs4 import BeautifulSoup as bsoup LOG = getlog(__name__) data = utils.read_utf8_file("files/emoji_all.json") emojis = json.loads(data) word_to_emoji = ddict(list) emoji_to_info = {} cache = db.RedisTable('emoji-cache') for emoji in emojis: emoji_to_info[emoji['emoji']] = emoji for kw in emoji['keywords']: kw = kw.lower() word_to_emoji[kw].append(emoji) if kw == 'flag': is_flag = True word_to_emoji[emoji['name'].lower()].append(emoji) def search_emojipedia(query): html = cache.get(query) if html is None: url = u'http://emojipedia.org/search/?q={}'.format(query)