def main(args): """ Read and write data into remote redis server """ from utils import charset_wrapper import redis r = redis.StrictRedis(host='172.18.28.118') syslog = xuxian.log.system_logger syslog.info('loading wikiline2entity file....') build_entity_wikilink_map(charset_wrapper(open(args.mid2wiki)), r) syslog.info('loading redirect file....') build_redirect_wikilink_map(charset_wrapper(open(args.redirect)), r) syslog.info('finished init global object')
def main(args):
    recovery_state = xuxian.recall(args.task_id)
    syslog = xuxian.log.system_logger
    Dict = xuxian.log.LogDict

    # init global object
    docs = wikiobj_to_doc(charset_wrapper(open(args.wiki_file)))
    nlp = init_corenlp(args.nlp_server, args.nlp_port)
    robj = redis.StrictRedis(host=args.redis_server, port=args.redis_port,
                             db=args.redis_db)

    # init output dump file
    entity_outfile = xuxian.apply_dump_file('entity',
                                            args.single_entity_output_file)
    entity_pair_outfile = xuxian.apply_dump_file('entity-pair',
                                                 args.entity_pair_output_file)

    # iterate over data input
    for doc in docs:
        syslog.info('to process doc_title=' + doc['title'].encode('utf-8'))

        for (lineno, line) in enumerate(doc['text']):
            # at the correct time point, clear the recovery state
            if recovery_state == doc['title'] + str(lineno):
                recovery_state = None
            if recovery_state is not None:
                continue

            # every line is a paragraph in wikipedia
            line = line.rstrip()
            if not line:
                continue

            plaintext = get_plain_text(line)
            mentions = get_plain_text_mention_info(line)
            syslog.debug(Dict({'plaintext': plaintext[:100].encode('utf-8'),
                               'mention': str(mentions)}))

            depparsed_output = depparse_paragraph(plaintext, nlp)
            if u'sentences' not in depparsed_output:
                # TODO: empty ?
                continue
            sentences = depparsed_output[u'sentences']

            syslog.debug('to process doc_title=' + doc['title'].encode('utf-8') +
                         '\tdoc_line=' + plaintext[:80].encode('utf-8'))

            process_paragraph_single_entity(sentences, mentions, robj,
                                            entity_outfile)
            process_paragraph_multiple_entity(sentences, mentions, robj,
                                              entity_pair_outfile)

            xuxian.remember(args.task_id,
                            (doc['title'] + unicode(lineno)).encode('utf-8'))
def test(args): """ check for all key if they still exist in remote redis server """ from utils import charset_wrapper import redis r = redis.StrictRedis(host='172.18.28.118') syslog = xuxian.log.system_logger for entity, wikilink in (line.strip().split('\t') for line in charset_wrapper(open(args.mid2wiki))): wikilink = upper_first_letter(wikilink) res = r.get('wiki2mid' + wikilink.encode('utf-8')) if res != entity.encode('utf-8'): syslog.info((u'err_wiki2mid\tkey=' + wikilink + u'\tval=' + entity).encode('utf-8')) else: syslog.info('ok_wiki2mid\tkey=' + wikilink.encode('utf-8')) for link1, link2 in (line.rstrip().split('\t') for line in charset_wrapper(open(args.redirect))): link1, link2 = upper_first_letter(link1), upper_first_letter(link2) res = r.get('redir' + link1.encode('utf-8')) if res != link2.encode('utf-8'): syslog.info((u'err_redir\tkey=' + link1 + u'\tval=' + link2).encode('utf-8')) else: syslog.info('ok_redir\tkey=' + link1.encode('utf-8'))
def build_key_properties_table(event_schema_file):
    """ Read the schema file, return an object as {event_type: set_of_key_properties} """
    keytable, evtype, event = {}, None, set()
    for line in charset_wrapper(open(event_schema_file)):
        line = line.rstrip()
        if line.startswith(u'\t'):
            event.add(line.lstrip())
        else:
            if evtype is not None:
                keytable[evtype] = event
            evtype = line
            event = set()
    if evtype is not None:
        # flush the last event type (guards against an empty schema file)
        keytable[evtype] = event
    return keytable
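# Example (hypothetical schema excerpt; format assumed from the parser above:
# an event type on its own line, each of its key properties on a following
# tab-indented line):
#
#     marriage
#     \tperson
#     \tlocation
#     acquisition
#     \tcompany_acquired
#
# build_key_properties_table() would then return:
#     {u'marriage': set([u'person', u'location']),
#      u'acquisition': set([u'company_acquired'])}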
def find_context_sentence_for_events(robj, outfile, event_schema, string_to_mid,
                                     sentence_entity_file):
    evtypes = event_schema.keys()
    for sentence, mentions in sentence_reader(
            charset_wrapper(open(sentence_entity_file))):
        # map each mention string to its position info (m[1], m[2])
        mention_pos = dict((m[0], (m[1], m[2])) for m in mentions)

        # one redis key per (event type, mention pair) candidate
        rkeys = [make_rkey(evtype, (m1[0], m2[0]))
                 for evtype, m1, m2 in enumerate_rkeys(evtypes, mentions)]

        for rkey in rkeys:
            data = robj.get(rkey)
            if data is None:
                continue
            evdata = json.loads(data)
            output_ev_context(outfile, sentence, mention_pos, evdata)
            doc_obj['url'] = m.group(4)
            doc_obj['title'] = m.group(6)
            doc_obj['text'] = []
            continue

        if not state_out_of_doc and DOC_END_PATTERN.match(line):
            state_out_of_doc = True
            yield doc_obj
            continue

        if not state_out_of_doc:
            doc_obj['text'].append(line)
            continue

        raise ValueError('failed with: state_out_of_doc=%s line=%s'
                         % (state_out_of_doc, line))


if __name__ == "__main__":
    import sys
    from utils import charset_wrapper

    class PrettyDoc(dict):
        def __str__(self):
            return ("id:\t" + str(self['id']) + "\n" +
                    "url:\t" + str(self['url']) + "\n" +
                    "title:\t" + str(self['title']) + "\n" +
                    "\n".join(self['text'][:5]))

    doc = wikiobj_to_doc(charset_wrapper(open(sys.argv[1], 'r'))).next()
    print PrettyDoc(doc)
def build_string_mid_table(mid_to_entity_file):
    """ Turn a tab-separated file <mid, string> into a dict <string, mid> """
    return dict((s.strip(), m) for (m, s) in (
        line.rstrip().split('\t')
        for line in charset_wrapper(open(mid_to_entity_file, 'r'))))
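# Example (hypothetical file contents; tab-separated <mid, string> lines as
# described in the docstring):
#
#     /m/02mjmr\tBarack Obama
#     /m/03gh4\tHawaii
#
# build_string_mid_table() would then return:
#     {u'Barack Obama': u'/m/02mjmr', u'Hawaii': u'/m/03gh4'}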
        mention_len = pat_end - pat_start
        cursor += mention_len
        plain_mention_cursor += len(mention_repr)

    # no more mentions in the tail, cursor info is trivial now
    return plain_mentions


if __name__ == "__main__":
    from utils import charset_wrapper
    from wiki_doc import wikiobj_to_doc
    from entity_mentions import get_entity_mentions, get_entity_mentions_in_lines
    from entity_mentions import get_plain_text, get_plain_text_mention_info
    import sys

    doc = wikiobj_to_doc(charset_wrapper(open(sys.argv[1], 'r'))).next()

    # unit test for get_entity_mentions and get_entity_mentions_in_lines
    print "\n".join("%3d" % i + "\t" + str(mention) + "\t====>\t" +
                    ", ".join(x.encode('utf-8') for x in (mention[2], mention[3]))
                    for (i, mention) in get_entity_mentions_in_lines(doc['text']))

    # unit test for get_plain_text and get_plain_text_mention_info
    for line in doc['text']:
        plain_line = get_plain_text(line)
        mentions = get_plain_text_mention_info(line)
        if not mentions:
            continue
        print "\nplain_line:\t" + plain_line.encode('utf-8').rstrip()