def bbc_parser(doc, category):
    """Parse a raw BBC article into a structured dict.

    The first line of *doc* is taken as the title; each remaining
    non-empty line is treated as one paragraph.  A 15-second SIGALRM
    watchdog bounds total parsing time (semantics() can hang on bad
    input — presumably why the alarm exists; confirm with caller).

    Parameters:
        doc: full article text, title on the first line.
        category: unused here; kept for signature compatibility with
            the other corpus parsers.

    Returns:
        dict with keys "id", "title", "sents", "paras", "pos",
        "nouns", "ners" — or None if parsing fails or times out.
    """
    try:
        # Watchdog: deliver SIGALRM if parsing exceeds 15 seconds.
        signal.alarm(15)
        lines = doc.split("\n")
        title = lines[0]
        _id = normalize_title(title)
        tn = TextNormalizer()
        pos, nouns, ners = semantics(doc)
        nouns = tn.fmap(nouns)
        ners = tn.fmap(ners)
        paragraphs = []
        sentences = []
        for line in lines[1:]:
            if not line:
                continue
            sentences += tn.fmap(sentence_tokenize(line))
            paragraphs.append(tn.normalize(line))
        return {
            "id": _id,
            "title": title,
            "sents": sentences,
            "paras": paragraphs,
            "pos": pos,
            "nouns": nouns,
            "ners": ners,
        }
    except Exception as e:
        # Report the actual failure instead of a bare generic message
        # (the original swallowed the exception detail entirely).
        print("Could not process article: %s" % e)
        return None
    finally:
        # Cancel the watchdog on every exit path; the original
        # duplicated signal.alarm(0) in both the success and the
        # exception branch.
        signal.alarm(0)
def read_wikistats(lang, f): """ Read wikistats and process redirect pages """ try: for line in f: try: field = line.split() if lang == field[0]: page = field[1] if utils.is_valid_title(page) and utils.is_title_in_ns0(page): if not REDIRECTS: print line, else: title = utils.normalize_title(page) if title: pagecounts[title] = pagecounts.get(title, 0) + int(field[2]) except UnicodeError: sys.stderr.write("UnicodeError: %s" % line) except IndexError: sys.stderr.write("IndexError: %s" % line) except IOError: sys.stderr.write("IOError") finally: if f: f.close()
def new(cls, name):
    """Create, persist, and return a Tag entity for *name*.

    Stores both the raw name and its normalized form (via
    normalize_title) so tags can be matched in normalized form.
    """
    normalized = normalize_title(name)
    created = Tag(name=name, normal=normalized)
    created.put()
    return created