Beispiel #1
0
def test_regex(fp, verbose=True):
    """ Check the scripture regex against the test lines read from fp.

    Each line is stripped, lower-cased and has the form ``inputstring`` or
    ``inputstring,expected``. ``expected`` is compared against
    ``book + " " + verse`` of every match found in ``inputstring``. Only the
    last comma separates the expectation, so the input text may itself
    contain commas. Lines without an expectation are still matched (and
    printed when verbose) but can never fail.

    fp -- iterable of text lines that also has a ``name`` attribute
    verbose -- when true, print ``input,book verse`` for every match

    Calls sys.exit(1) on the first match that differs from the expectation,
    or on the first line that has an expectation but no match at all.
    """
    from find_all_scriptures import find_all_scriptures, filtergroupdict

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("Reading %s" % (fp.name,))
    found_books = set()
    for line in fp:
        line = line.strip().lower()
        answer = None
        if "," in line:
            # Split on the last comma only; unpacking an unbounded split
            # raises ValueError whenever the input text contains a comma.
            line, answer = line.rsplit(",", 1)
            # Tolerate "input, expected" (space after the separator).
            answer = answer.strip()

        found = False
        for ma in find_all_scriptures(line):
            ret = filtergroupdict(ma)
            if answer and ret["book"] + " " + ret["verse"] != answer:
                print("%s matched wrong regex %s, should be=%s" % (line, ret["book"], answer))
                sys.exit(1)
            if verbose:
                print("%s,%s %s" % (line, ret["book"], ret["verse"]))
            found_books.add(ret["book"])
            found = True

        # An expectation was given but nothing matched at all.
        if not found and answer:
            print("Failed to match  %s %s %s" % (line, found, answer))
            sys.exit(1)

    print("Found %d distinct books" % len(found_books))
Beispiel #2
0
def fix_results(fn, outputdir="/tmp/", show_misses=True):
    """ Re-check captured results against the current scripture regex.

    Assumes the caller has generated a new regex and wants to fix the results
    captured with the old regex. Reads a results file with one JSON object
    per line (each needs the keys "text" and "bibleverse") and copies every
    line that still matches to a file of the same base name in outputdir.

    A line is a miss when the regex finds nothing in its lower-cased text or
    when its "bibleverse" is not listed in
    ./analysis/join_data/bibleverses.txt. For kept lines the verse is
    recomputed from the last match and a message is printed when it differs
    from the stored one.

    fn -- input path; a name ending in "gz" is read and written with gzip
    outputdir -- directory that receives the filtered copy
    show_misses -- when true, print every line that is dropped

    Any exception raised while processing the lines is reported on stdout
    (offending line, last regex result, traceback) and processing stops; it
    is not re-raised. Python 2 code: relies on the ``unicode`` builtin.
    """
    from find_all_scriptures import find_all_scriptures, filtergroupdict
    import copy
    import gzip
    import traceback

    # Compressed input produces compressed output.
    opener = gzip.open if fn.endswith("gz") else open
    outpath = os.path.join(outputdir, os.path.basename(fn))

    # Read the known verses first so a missing verse file fails before the
    # output file is created (and truncated).
    with open("./analysis/join_data/bibleverses.txt") as bv_fp:
        bv_set = set(verse.strip() for verse in bv_fp)

    # Pre-bound so the failure report below never hits an unbound name.
    line = None
    ret = None

    with opener(fn) as fp, opener(outpath, "w") as found_match_fp:
        print("Reading %s" % (fn,))
        print("Writing fixed file to %s" % (found_match_fp.name,))
        print("")

        try:
            for line in fp:
                ret = None  # do not report a stale result from an earlier line
                res = json.loads(line)
                txt = res["text"].lower()
                matches = list(find_all_scriptures(txt))

                if not matches or res["bibleverse"] not in bv_set:
                    if show_misses:
                        print("missed %s" % (line,))
                    continue

                # NOTE(review): the output receives the original line, not the
                # recomputed record below -- confirm that this is intended.
                found_match_fp.write(line)

                # Use the last match explicitly instead of depending on the
                # loop variable that a Python 2 list comprehension leaks.
                ma = matches[-1]
                ret = filtergroupdict(ma)
                newres = copy.deepcopy(res)
                newres["matext"] = ma.string[ma.start() : ma.end()].replace("\r\n", " ")  # actual matched string
                newres["book"] = ret["book"]
                newres["bibleverse"] = " ".join((ret["book"], ret["verse"]))
                if newres["bibleverse"] != res["bibleverse"]:
                    print("Matched verse changed from %s to %s - text '%s'\n" % (
                        res["bibleverse"],
                        newres["bibleverse"],
                        unicode(res["text"]).encode("ascii", errors="ignore"),
                    ))
        except Exception:
            # Best-effort diagnostics; the error is deliberately not re-raised.
            print("Failure!!!")
            print("line %s" % (line,))
            print("regex returned %s" % (ret,))
            print("traceback %s" % (traceback.format_exc(),))
Beispiel #3
0
 def process(self,tup):
     """ Emit the incoming record once for every scripture match in its text.

     The record is taken from tup.values[0] and must provide the keys 'text'
     and 'tweetid'. For each match found in the lower-cased text the same
     record object is updated with 'matext', 'book' and 'bibleverse' and then
     emitted as a one-element list. Nothing is emitted when there is no match.
     """
     record = tup.values[0]
     lowered = record['text'].lower()
     tweetid = record["tweetid"]  # value unused; the lookup itself is kept
     for match in find_all_scriptures(lowered):
         groups = filtergroupdict(match)
         source = match.string
         # actual matched string, with Windows line breaks flattened
         record['matext'] = source[match.start():match.end()].replace('\r\n',' ')
         book = groups['book']
         record['book'] = book
         record['bibleverse'] = " ".join([book, groups['verse']])
         storm.emit([record])