def main():
    """Parse CLI arguments and run basic analytics over pre-sorted n-grams.

    Sets module-level configuration (topN, ys_list, plus the flags declared
    global) that do_analytics() and its helpers read, then dispatches to
    do_analytics() with the parsed gram list.
    """
    global start, stop, topN, SENTENCES, WORDS, ENTITIES, ys_list

    if MEM_DEBUG:
        # Keep 10 frames of traceback per allocation for memory diagnostics.
        tracemalloc.start(10)

    parser = argparse.ArgumentParser(
        description='Runs basic analytics over pre-sorted n-grams')
    parser.add_argument(dest="N",
                        type=int,
                        help='Number of output phrases per metric')
    parser.add_argument(
        dest="grams",
        type=str,
        nargs="+",
        help="Gram types to include. Numerical or any of 'emnsuw'")
    parser.add_argument('-s',
                        dest="sentences",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine sentences')
    parser.add_argument('-w',
                        dest="words",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine words')
    parser.add_argument('-e',
                        dest="entities",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine entities')

    # Shared project-wide options (defined in util).
    util.add_arguments(parser)

    args = parser.parse_args()

    topN = args.N

    # Each positional "gram" is either an integer n (collect n-grams) or a
    # string of single-letter gram-type codes drawn from "swenum".
    gram_list = []
    for gram in args.grams:
        try:
            gram_list.append(int(gram))
        except ValueError:
            # Not an integer: treat each character as a gram-type code.
            for char in gram:
                if char not in "swenum":
                    raise ValueError("Illegal gram: %s" % char)
                gram_list.append(char)

    util.process_arguments(args)
    # Analytics reads are one-shot; skip the DB cache layer.
    util.CACHE_DB = False

    ys_list = list(util.iter_yearseason())

    do_analytics(gram_list)
# --- Beispiel #2 (scraper artifact: separator between concatenated examples) ---
            write_obj.append({"id": i, "domains": l})
            i += 1
        json.dump(write_obj, f)

def make_dirs(intervals):
    """Ensure an output directory exists under ../data/text_sim/ for each interval."""
    template = "../data/text_sim/%s/"
    for interval in intervals:
        target = Path(template % interval)
        target.mkdir(parents=True, exist_ok=True)

    

if __name__ == "__main__":
    # CLI entry point: finds and flags duplicate documents across the
    # requested intervals by building a text-similarity graph per interval.
    parser = argparse.ArgumentParser(description='Find and flag duplicate documents')

    parser.add_argument('intervals', type=str, nargs='+',
                                            help='Which intervals to process over. "all" scans all intervals in sequence')
    parser.add_argument('--sample', dest="sample_fdd", action='store_const', const=True, default=False, help='Sample')

    # Shared project-wide options (defined in util).
    util.add_arguments(parser)
    args = parser.parse_args()
    intervals = args.intervals
    # Module-level flag; presumably read as a global by make_textsim_graph
    # or helpers — confirm before renaming.
    SAMPLE = args.sample_fdd
    util.process_arguments(args)
    # A first interval of "all" expands to every known year/season.
    if intervals[0] == "all":
        intervals = list(util.iter_yearseason())

    # Create ../data/text_sim/<interval>/ output directories up front.
    make_dirs(intervals)

    for interval in intervals:
        make_textsim_graph(interval)
    # Shut down util's worker pool cleanly before exit.
    util.close_pool()
# --- Beispiel #3 (scraper artifact: separator between concatenated examples) ---
def main():
    """Parse CLI arguments and break documents into n-grams over the
    requested intervals.

    Builds the gram groups to collect (numeric n-grams plus optional
    sentence/word/entity grams), resolves the interval list, and hands
    everything to generate_gram_list().
    """
    global NO_PUNCTUATION

    if LOG_MEM:
        tracemalloc.start()

    logging.info("Starting at %s " % datetime.now().strftime("%H:%M:%S"))
    # FIX: corrected "fitlers" -> "filters" typo in the user-facing help text.
    parser = argparse.ArgumentParser(
        description='Breaks documents into n-grams under a variety of filters')
    parser.add_argument('--start',
                        dest="MIN",
                        default=3,
                        type=int,
                        help='Analyze n-grams with n>=start')
    parser.add_argument('--stop',
                        dest="MAX",
                        default=9,
                        type=int,
                        help='Analyze n-grams with n<=stop')
    parser.add_argument(dest="intervals",
                        type=str,
                        nargs='+',
                        help='Intervals to collect n-grams over')
    parser.add_argument('-s',
                        dest="sentences",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine sentences')
    parser.add_argument('-w',
                        dest="words",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine words')
    parser.add_argument('-e',
                        dest="entities",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine entities')

    # Shared project-wide options (defined in util).
    util.add_arguments(parser)

    args = parser.parse_args()

    start = args.MIN
    stop = args.MAX + 1  # +1 so range(start, stop) is inclusive of MAX
    yearseasons = args.intervals
    SENTENCES = args.sentences
    WORDS = args.words
    ENTITIES = args.entities

    util.process_arguments(args)

    NO_PUNCTUATION = util.NO_PUNCTUATION
    # NOTE(review): the three locals below are assigned but never used in
    # this function; kept in case downstream code once relied on them.
    MERGE_SIMILAR = util.MERGE_SIMILAR
    clean = "_CL" if util.USE_CLEAN else ""
    np = "_NP" if NO_PUNCTUATION else ""

    global stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # Placeholder tokens produced by the cleaning pass count as stopwords too.
    cleaned_words = set(["_organization_", "_number_", "_url_", "_email_"])
    stopwords.update(cleaned_words)

    # Ensure a per-interval output directory exists.
    # BUG FIX: the original referenced an undefined `yearseason` here (it was
    # only bound later, in the interval-parsing loop), and a bare `except:
    # pass` silently swallowed the resulting NameError — so no directory was
    # ever created.
    for ys in yearseasons:
        if ys != "all":
            os.makedirs("../data/%s/" % ys, exist_ok=True)

    gram_groups = [[n] for n in range(start, stop)]
    if SENTENCES:
        gram_groups.append(["s"])
    if WORDS:
        gram_groups.append(["w"])
    if ENTITIES:
        gram_groups.append(["e", "m", "u"])

    # Decide how much we're going to iterate.
    if yearseasons[0] == "all":
        # Full rebuild: wipe previously generated gram data first.
        logging.info("Removing old data at %s " %
                     datetime.now().strftime("%H:%M:%S"))
        ioutils.remove_grams()
        logging.info("Done removing old data at %s" %
                     datetime.now().strftime("%H:%M:%S"))
        intervals = list(util.iter_year_season())
    else:
        intervals = []
        for yearseason in yearseasons:
            # Intervals look like "2019A" (year + season letter) or
            # "2019" (expands to both seasons A and B).
            year = int(yearseason[:4])
            if len(yearseason) == 5:
                season = yearseason[4]
                intervals.append((year, season))
            elif len(yearseason) == 4:
                intervals.append((year, 'A'))
                intervals.append((year, 'B'))
            else:
                logging.error("Error on %s\n" % yearseason)

    generate_gram_list(gram_groups, intervals)

    if LOG_MEM:
        print("Max memory usage:")
        # NOTE(review): tracemalloc._format_size is a private API; fine for
        # debug output, but it may break across Python versions.
        print("Current: %s, Peak: %s" % tuple(
            (tracemalloc._format_size(m, False)
             for m in tracemalloc.get_traced_memory())))