def test_regex(fp, verbose=True):
    """
    Run the scripture regex over every test line read from ``fp``.

    Each line should look like ``inputstring,expected`` where ``expected``
    is the "<book> <verse>" the regex is supposed to produce. Lines are
    stripped and lowercased before use. If a line carries no expected value
    it is still scanned, but nothing is verified for it.

    fp      -- open file-like object of test lines; ``fp.name`` is printed.
    verbose -- when true, print every match as "line,book verse".

    Exits the process with status 1 as soon as a match disagrees with the
    expected value, or when a line with an expected value matches nothing.
    Prints the number of distinct books found at the end.
    """
    from find_all_scriptures import find_all_scriptures, filtergroupdict

    print("Reading %s" % (fp.name,))
    found_books = set()
    for line in fp:
        found = False
        line = line.strip().lower()
        if "," in line:
            # The input text may itself contain commas, so only the last
            # comma separates it from the expected answer.
            line, answer = line.rsplit(",", 1)
            answer = answer.strip()
        else:
            answer = None

        for ma in find_all_scriptures(line):
            ret = filtergroupdict(ma)
            if answer and ret["book"] + " " + ret["verse"] != answer:
                print("%s matched wrong regex %s, should be=%s" % (line, ret["book"], answer))
                sys.exit(1)
            if verbose:
                print("%s,%s %s" % (line, ret["book"], ret["verse"]))
            found_books.add(ret["book"])
            found = True

        if not found and answer:
            print("Failed to match  %s %s %s" % (line, found, answer))
            sys.exit(1)
    print("Found %d distinct books" % len(found_books))
def fix_results(fn, outputdir="/tmp/", show_misses=True):
    """
    Re-check a captured results file against the current scripture regex.

    Assumes the caller has generated a new regex and wants to fix the
    results captured with the old regex. Reads a file of JSON lines (tweets
    captured by habakkuk), copies every line that still matches into a file
    of the same name under ``outputdir`` and shows any line that does not.

    A line counts as a miss when the regex finds nothing in its lowercased
    ``text`` or when its ``bibleverse`` is not listed in
    ``./analysis/join_data/bibleverses.txt``.

    fn          -- path of the results file; names ending in "gz" are read
                   and written with gzip.
    outputdir   -- directory that receives the filtered copy.
    show_misses -- when true, print each line that is treated as a miss.

    Errors raised while opening the files propagate to the caller. Any
    exception raised while processing lines is reported together with a
    traceback and ends the run early instead of propagating.
    """
    from find_all_scriptures import find_all_scriptures, filtergroupdict
    from contextlib import closing
    import copy
    import gzip
    import traceback

    # Counters are tallied but not reported anywhere in this function.
    found_match_cnt = 0
    miss_match_cnt = 0
    # Pre-bind what the failure report prints so the handler cannot itself
    # fail with a NameError.
    line = None
    ret = None

    opener = gzip.open if fn.endswith("gz") else open
    outpath = os.path.join(outputdir, os.path.basename(fn))

    # closing() guarantees both handles are closed (and the gzip stream is
    # flushed) on every exit path.
    with closing(opener(fn)) as fp, closing(opener(outpath, "w")) as found_match_fp:
        with open("./analysis/join_data/bibleverses.txt") as bv_fp:
            bv_set = set(entry.strip() for entry in bv_fp)

        print("Reading %s" % (fn,))
        print("Writing fixed file to %s" % (found_match_fp.name,))
        print("")
        try:
            for line in fp:
                ret = None  # do not report a stale value from an earlier line
                res = json.loads(line)
                txt = res["text"].lower()
                matches = list(find_all_scriptures(txt))
                if not matches or res["bibleverse"] not in bv_set:
                    miss_match_cnt += 1
                    if show_misses:
                        print("missed %s" % (line,))
                else:
                    # NOTE(review): the original line is written, not the
                    # recomputed ``newres`` below -- confirm this is intended.
                    found_match_fp.write(line)
                    found_match_cnt += 1
                    # The last match is the one used for the comparison.
                    ma = matches[-1]
                    ret = filtergroupdict(ma)
                    newres = copy.deepcopy(res)
                    newres["matext"] = ma.string[ma.start() : ma.end()].replace("\r\n", " ")  # actual matched string
                    newres["book"] = ret["book"]
                    newres["bibleverse"] = " ".join((ret["book"], ret["verse"]))
                    if newres["bibleverse"] != res["bibleverse"]:
                        print("Matched verse changed from %s to %s - text '%s'\n" % (
                            res["bibleverse"],
                            newres["bibleverse"],
                            unicode(res["text"]).encode("ascii", errors="ignore"),
                        ))
        except Exception:
            print("Failure!!!")
            print("line %s" % (line,))
            print("regex returned %s" % (ret,))
            print("traceback %s" % (traceback.format_exc(),))
def process(self, tup):
    """
    Scan the text of an incoming tuple for scripture references.

    The first value of ``tup`` is treated as a dict with at least the keys
    'text' and 'tweetid'. For every match found in the lowercased text the
    dict is updated in place with 'matext' (the matched substring),
    'book' and 'bibleverse' ("<book> <verse>") and then emitted.
    """
    tweet = tup.values[0]
    lowered = tweet['text'].lower()
    # Looked up but not used further in this method; a missing key raises
    # KeyError here.
    tweetid = tweet["tweetid"]

    for match in find_all_scriptures(lowered):
        fields = filtergroupdict(match)
        book = fields['book']
        matched_text = match.string[match.start():match.end()]
        tweet['matext'] = matched_text.replace('\r\n', ' ')  # actual matched string
        tweet['book'] = book
        tweet['bibleverse'] = " ".join((book, fields['verse']))
        storm.emit([tweet])