Beispiel #1
0
def getDateInfo(filename):
    """Interactively learn timestamp regexes from the lines of a file.

    Every line that possiblyHasTimeStamp() accepts and that guessTimestamp()
    cannot parse with the regexes learned so far is shown to the user, who is
    prompted for its timestamp values.  The answer is handed to
    learnrobRegex(); a resulting regex that is not already known is recorded
    together with its html.

    Args:
        filename: file name handed to dcutils.loadLines() and genformatname().

    Returns:
        A (regexes, totalhtml) tuple: the list of learned regexes and the
        concatenation of the html returned alongside each newly learned one.
        Whatever has been learned so far is returned when the user hits
        Control-C or when an unexpected exception is raised inside the
        learning loop (its traceback is printed).  Errors raised by
        dcutils.loadLines() itself are not caught.
    """
    regexes = []
    totalhtml = ""
    count = 1  # number handed to genformatname(); bumped for each new regex
    lines = dcutils.loadLines(filename)
    try:
        printQuestion("Interactively Learning Date Formats.")
        printedInstructions = False
        for linenum, line in enumerate(lines, 1):
            line = line.strip()
            if not possiblyHasTimeStamp(line):
                print("Skipping unpromissing line " + str(linenum) + ".")
                continue
            if guessTimestamp(regexes, line) is not None:
                # One of the regexes learned so far already handles this line.
                print("Parsed Date on line " + str(linenum) + ".")
                continue
            if not printedInstructions:
                # Show the instructions once, right before the first prompt.
                printedInstructions = True
                print(Instruction)

            # Format the line only now that we know it will be displayed.
            prettyLine = formatLine(line, 100, "\t")
            print("\nSAMPLE LINE " + str(linenum) + ":\n" + prettyLine + "\n" + "-" * 80)
            while True:
                timeformat = prompt(
                    "timestamp values as: month, day, year, hour, minute, second, ampm, timezone.\n\t"
                )
                if timeformat == "":
                    break  # user says there is no timestamp on this line
                timevalues = [v.lower().strip() for v in timeformat.split(",")]
                formatname = genformatname(filename, count)
                html, regex = learnrobRegex(formatname, line, timevalues)
                if html is None:
                    print("Unable to learn pattern.  Enter the timestamp values again.  If there is no timestamp on this line, just hit Enter.")
                    continue
                print("Learned pattern.")
                if regex not in regexes:
                    totalhtml += html
                    count += 1
                    regexes.append(regex)
                break
            printQuestion("If you are satisfied that the timestamps formats have been learned, hit Control-C.")

    except KeyboardInterrupt:
        # Control-C is how the user ends the session; keep what was learned.
        pass
    except Exception:
        import traceback

        traceback.print_exc()
    return regexes, totalhtml
def _promptForTimestampPattern(filename, count, line):
    """Ask the user for the timestamp values of line until a pattern is learned.

    Returns the (html, regex) pair produced by learnrobRegex(), or None when
    the user gives an empty answer to say the line carries no timestamp.
    """
    while True:
        timeformat = prompt("timestamp values as: month, day, year, hour, minute, second, ampm, timezone.\n\t")
        if timeformat == "":
            return None  # user says there is no timestamp on this line
        timevalues = [v.lower().strip() for v in timeformat.split(",")]
        formatname = genformatname(filename, count)
        html, regex = learnrobRegex(formatname, line, timevalues)
        if html is not None:
            print("Learned pattern.")
            return html, regex
        print("Unable to learn pattern.  Enter the timestamp values again.  If there is no timestamp on this line, just hit Enter.")


def getDateInfo(filename):
    """Walk the lines of a file and interactively learn its timestamp formats.

    Lines rejected by possiblyHasTimeStamp() are skipped, and lines that
    guessTimestamp() can parse with the regexes learned so far are only
    reported.  For every other line the user is prompted for the timestamp
    values; each regex learned that is not already known is kept, along with
    its html.

    Args:
        filename: file name handed to dcutils.loadLines() and genformatname().

    Returns:
        (regexes, totalhtml): the learned regexes and the concatenated html
        of the newly learned ones.  Control-C ends the session early and
        still returns what was learned; any other exception raised while
        learning has its traceback printed and the partial result returned.
    """
    regexes = []
    totalhtml = ""
    count = 1  # number handed to genformatname(); bumped for each new regex
    lines = dcutils.loadLines(filename)
    try:
        printQuestion("Interactively Learning Date Formats.")
        printedInstructions = False
        for linenum, line in enumerate(lines, 1):
            line = line.strip()
            if not possiblyHasTimeStamp(line):
                print("Skipping unpromissing line " + str(linenum) + ".")
                continue
            if guessTimestamp(regexes, line) is not None:
                print("Parsed Date on line " + str(linenum) + ".")
                continue
            if not printedInstructions:
                printedInstructions = True
                print(Instruction)

            print("\nSAMPLE LINE " + str(linenum) + ":\n" + formatLine(line, 100, "\t") + "\n" + "-" * 80)
            learned = _promptForTimestampPattern(filename, count, line)
            if learned is not None:
                html, regex = learned
                if regex not in regexes:
                    totalhtml += html
                    count += 1
                    regexes.append(regex)
            printQuestion("If you are satisfied that the timestamps formats have been learned, hit Control-C.")

    except KeyboardInterrupt:
        # The user was told to hit Control-C when satisfied; not an error.
        pass
    except Exception:
        import traceback
        traceback.print_exc()
    return regexes, totalhtml
Beispiel #3
0
def printLineSamplings(filenames, maxPerFile = 10, searchKey = None):
    import random
    for fn in filenames:
        lines = dcutils.loadLines(fn)
        print "\nSample lines from", fn, "\n------------------------------------------------------------------------"
        count = 1
        for line in lines:
            line = line.strip()
            if random.randint(1,10) == 1 and len(line) > 20 and (searchKey == None or searchKey in line):
                print "\t", line
                count += 1
                if count > maxPerFile:
                    break
    print
def printLineSamplings(filenames, maxPerFile = 10, searchKey = None):
    import random
    for fn in filenames:
        lines = dcutils.loadLines(fn)
        print "\nSample lines from", fn, "\n------------------------------------------------------------------------"
        count = 1
        for line in lines:
            line = line.strip()
            if random.randint(1,10) == 1 and len(line) > 20 and (searchKey == None or searchKey in line):
                print "\t", line
                count += 1
                if count > maxPerFile:
                    break
    print
def learnFieldRulesFromFile(filename, fieldname, goodTerms, badTerms, maxLines, first):
    filetype = getFileType(filename)
    _printTiming("Got filetype: " + filetype)
    lines = dcutils.loadLines(filename)
    if len(lines) > maxLines:
        lines = lines[:maxLines]
        if first:
            print "Large training file.  Limiting learning to first", maxLines, "lines of", filename
    print "Learning..."
    _printTiming("Loaded lines")
    # strictRules = _generateRules(filetype, fieldname, lines, goodTerms, None)
    looseRules = _generateRules(filetype, fieldname, lines, goodTerms, "Loose")
    #rules = strictRules + looseRules
    rules = looseRules
    _printTiming("Generated lines")
    if _debug > 0: print "Rules Generated:",  len(rules)
    newTerms = _validateRules(lines, goodTerms, badTerms, rules)
    _printTiming("Validated lines")
    if _debug > 0: print "Rules Approved:",  len(rules)
    #if rulesdict != None and len(rulesfile) > 0:
    #  saveRules(rulesfile, rules)
    return rules, newTerms
Beispiel #6
0
def learnFieldRulesFromFile(filename, fieldname, goodTerms, badTerms, maxLines,
                            first):
    filetype = getFileType(filename)
    _printTiming("Got filetype: " + filetype)
    lines = dcutils.loadLines(filename)
    if len(lines) > maxLines:
        lines = lines[:maxLines]
        if first:
            print "Large training file.  Limiting learning to first", maxLines, "lines of", filename
    print "Learning..."
    _printTiming("Loaded lines")
    # strictRules = _generateRules(filetype, fieldname, lines, goodTerms, None)
    looseRules = _generateRules(filetype, fieldname, lines, goodTerms, "Loose")
    #rules = strictRules + looseRules
    rules = looseRules
    _printTiming("Generated lines")
    if _debug > 0: print "Rules Generated:", len(rules)
    newTerms = _validateRules(lines, goodTerms, badTerms, rules)
    _printTiming("Validated lines")
    if _debug > 0: print "Rules Approved:", len(rules)
    #if rulesdict != None and len(rulesfile) > 0:
    #  saveRules(rulesfile, rules)
    return rules, newTerms
    argv = sys.argv
    rulesfile = "rules.xml"
    if argc == 6:
        filename = argv[1]
        fieldname = argv[2]
        rulesfile = argv[3]
        goodstr = argv[4]
        badstr = argv[5]
        goodterms = set([v.strip() for v in goodstr.split(",")])
        badterms = set([v.strip() for v in badstr.split(",")])
        #rules, newterms = learnFieldRulesFromFile(rulesfile, filename,  fieldname, goodterms, badterms)
        rules, newterms = interactivelyLearn(filename,  fieldname, goodterms, badterms, 5, 10000)
        print len(rules), "rules"
        print "Terms: ", newterms
    elif argc == 3:
        filename = argv[1]
        rulesfile = argv[2]
        filetype = getFileType(filename)
        rulesdict = {}
        lines = dcutils.loadLines(filename)
        for line in lines:
            extractions = getExtractions(rulesdict, filetype, line)
            if extractions != None:
                print line
                print "\t", extractions
        
    else:
        print 'Usage \n'
        print '\tTo Train: \t' + argv[0] + ' <file> <fieldname> <rulesfile. empty "" to not save> "<good terms comma separated>" "<bad terms comma separated>"'
        print '\tTo Run: \t' + argv[0] + ' <file> <rulesfile>'
Beispiel #8
0
    if argc == 6:
        filename = argv[1]
        fieldname = argv[2]
        rulesfile = argv[3]
        goodstr = argv[4]
        badstr = argv[5]
        goodterms = set([v.strip() for v in goodstr.split(",")])
        badterms = set([v.strip() for v in badstr.split(",")])
        #rules, newterms = learnFieldRulesFromFile(rulesfile, filename,  fieldname, goodterms, badterms)
        rules, newterms = interactivelyLearn(filename, fieldname, goodterms,
                                             badterms, 5, 10000)
        print len(rules), "rules"
        print "Terms: ", newterms
    elif argc == 3:
        filename = argv[1]
        rulesfile = argv[2]
        filetype = getFileType(filename)
        rulesdict = {}
        lines = dcutils.loadLines(filename)
        for line in lines:
            extractions = getExtractions(rulesdict, filetype, line)
            if extractions != None:
                print line
                print "\t", extractions

    else:
        print 'Usage \n'
        print '\tTo Train: \t' + argv[
            0] + ' <file> <fieldname> <rulesfile. empty "" to not save> "<good terms comma separated>" "<bad terms comma separated>"'
        print '\tTo Run: \t' + argv[0] + ' <file> <rulesfile>'
    # next best are equally weird.  using pattern at start of line before break or end of line after break
    elif len(beforeEnd) > 0:
        print "LINE_BREAKER = %s" % makeRegex(list(beforeEnd)[0], False)        
    elif len(afterStart) > 0:
        print "LINE_BREAKER = %s" % makeRegex(list(afterStart)[0], True)
        

if __name__ == '__main__':
    import sys
    if len(sys.argv) != 2:
        print 'Usage:  python %s "file of events"' % sys.argv[0]
        print '        file should break events with "-=X=-" on a separate line'
    else:
        events = []
        filename = sys.argv[1]
        lines = dcu.loadLines(filename)
        if lines == []:
            print "cannot get events"
            exit(1)
        event = ''
        for line in lines:
            line = line.strip()
            if line == '-=X=-':
                if event != '':
                    events.append(event)
                event = ''
            else:
                if event != '':
                    event += '\n'
                event += line