def parse_file(self, path: str) -> "list[BibItem]":
		"""
		Parses the file at the given path and tags every parsed item
		with its origin.

		The file contents are read through utils.read_utf8_file (which,
		per the original docstring, handles a UTF-8 BOM) and handed to
		self.parse_string. Afterwards each item's "source" param is
		rewritten to "<file basename>:<NNNN>", where NNNN is the integer
		the item previously held under "source", zero-padded to four
		digits (presumably the line number recorded by parse_string --
		confirm against the parser).

		@param path: path to an existing regular file
		@returns list of parsed BibItem
		@raises Exception: if path is not an existing file, or if parsing
			or tagging fails; in the latter case the message is prefixed
			with the path and the original error is chained as __cause__
		"""
		if not os.path.isfile(path):
			raise Exception("Path to file expected")

		data = utils.read_utf8_file(path)
		try:
			source_file = os.path.basename(path)
			items = self.parse_string(data)
			for item in items:
				self.set_item_param(item, "source", "{source}:{line:04d}".format(
					source=source_file,
					line=item.get("source")))
			return items
		except Exception as ex:
			# Chain explicitly so the original traceback stays attached
			# to the re-raised, path-annotated error.
			raise Exception("While parsing {0}: {1}".format(path, ex)) from ex
Example #2
0
	def _parse_file(self, path):
		"""
		Parses the file at the given path and tags every parsed item
		with its origin.

		The file contents are read through utils.read_utf8_file (which,
		per the original docstring, handles a UTF-8 BOM) and handed to
		self._parse_string. Each resulting item then gets two params:
		"source_file" (the file's basename) and "source", formatted as
		"<source_file>:<NNNN>" where NNNN is the item's "source_line"
		value zero-padded to four digits.

		@param path: path to an existing regular file
		@returns list of parsed BibItem
		@raises Exception: if path is not an existing file, or if parsing
			or tagging fails; in the latter case the message is prefixed
			with the path and the original error is chained as __cause__
		"""
		if not os.path.isfile(path):
			raise Exception("Path to file expected")

		data = utils.read_utf8_file(path)
		try:
			source_file = os.path.basename(path)
			items = self._parse_string(data)
			for item in items:
				self.set_item_param(item, "source_file", source_file)
				self.set_item_param(item, "source", "{source_file}:{source_line:04d}".format(
					source_file=source_file,
					source_line=item.get("source_line"))
				)
			return items
		except Exception as ex:
			# Chain explicitly so the original traceback stays attached
			# to the re-raised, path-annotated error.
			raise Exception("While parsing {0}: {1}".format(path, ex)) from ex
Example #3
0
        return []
    ret = {}
    for term, freq in query_tfs.iteritems():
        nt = num_docs_with_word.get(term, 0)
        if nt == 0:
            continue
        if scheme not in {2, 3}:
            ret[term] = (1 + (1.0 * freq / max_term_freq)) * math.log(N / nt)
        elif scheme == 2:
            ret[term] = 1 + (1.0 * freq / max_term_freq)
        else:
            ret[term] = (1 + math.log(freq)) * math.log(N / nt)
    return ret


# Module-level setup: load the emoji dataset and start accumulating
# per-word statistics from each entry's emojipedia text fields.
# NOTE(review): the dataset path is absolute and machine-specific.
data = utils.read_utf8_file("/Users/stps/code/ytp_webapps/src/files/emojipedia_and_unicode.json")
emoji_data = json.loads(data)

# Accumulators; missing keys default to 0 (or to an empty list).
# word_to_freq is handed to update_tf_and_idf below; presumably it holds
# corpus-wide word counts -- confirm against that helper.
word_to_freq = defaultdict(int)
num_docs_with_word = defaultdict(int)
# Per-field weights. Only about_wt is used in the visible code (as `wt`
# for the 'about' text); alias_wt / name_wt presumably weight the aliases
# and the name the same way -- TODO confirm.
about_wt = 1
alias_wt = 3
name_wt = 5
word_to_docs = defaultdict(list)
# Number of entries in the dataset.
N = len(emoji_data)
for ed in emoji_data:
    # Pull the nested emojipedia fields, falling back to an empty value
    # when utils.safe_dct_get cannot resolve the key path.
    about = utils.safe_dct_get(ed, ['emojipedia', 'about'], default_val='')
    aliases = utils.safe_dct_get(ed, ['emojipedia', 'aliases'], default_val=[])
    name = utils.safe_dct_get(ed, ['emojipedia', 'name', 'emoji_name'], default_val='')
    # Fresh term-frequency table for this entry.
    tfs = defaultdict(int)
    update_tf_and_idf(string=about, tf=tfs, word_to_freq=word_to_freq, wt=about_wt)
Example #4
0
import json
from collections import defaultdict as ddict
import db
import requests

from initlogging import getlog
import utils
from bs4 import BeautifulSoup as bsoup

# Module logger, obtained from the project's initlogging helper.
LOG = getlog(__name__)

# Module-level setup: load the emoji list once at import time and build
# two lookup tables from it.
data = utils.read_utf8_file("files/emoji_all.json")
emojis = json.loads(data)
# Lower-cased keyword or name -> list of emoji records.
word_to_emoji = ddict(list)
# Value of each record's 'emoji' field -> the full record.
emoji_to_info = {}
# Redis-backed table; search_emojipedia looks queries up in it via cache.get.
cache = db.RedisTable('emoji-cache')

for emoji in emojis:
    emoji_to_info[emoji['emoji']] = emoji
    for kw in emoji['keywords']:
        kw = kw.lower()
        word_to_emoji[kw].append(emoji)
        if kw == 'flag':
            # NOTE(review): is_flag is a module global that is never reset
            # or read in the visible code -- confirm it is used elsewhere.
            is_flag = True
    # Also index by the lower-cased name; a record whose name equals one of
    # its keywords is therefore appended to that list twice.
    word_to_emoji[emoji['name'].lower()].append(emoji)


def search_emojipedia(query):
    html = cache.get(query)
    if html is None:
        url = u'http://emojipedia.org/search/?q={}'.format(query)