Example #1
0
def generate_gram_list(gram_groups, intervals):
    """Generate and persist n-gram counts for every gram group over the
    given (year, season) intervals.

    gram_groups: iterable of gram lists (e.g. [[3], [4], ["s"]]).
    intervals:   iterable of (year, season) pairs.
    """
    def _log_mem():
        # Report current/peak traced memory when memory logging is enabled.
        if LOG_MEM:
            current, peak = tracemalloc.get_traced_memory()
            logging.info("Memory: current: %s, peak: %s" % (
                tracemalloc._format_size(current, False),
                tracemalloc._format_size(peak, False)))

    _log_mem()

    policies_db_cache = ioutils.load_clean_policies_db()

    for group in gram_groups:
        n_str = get_n_str(group)
        logging.info("Starting counts for %s-grams at %s" %
                     (n_str, datetime.now().strftime("%H:%M:%S")))
        for year, season in intervals:
            yearseason = "%d%s" % (year, season)
            ngrams = generate_grams(group,
                                    yearseason,
                                    policies_db_cache=policies_db_cache)
            # Persist the counts for this interval.
            write_grams(ngrams, year, season)
            # Drop the reference so the counts can be collected promptly.
            ngrams = None
        logging.info("Done with counts for %s-grams at %s" %
                     (n_str, datetime.now().strftime("%H:%M:%S")))

        if LOG_MEM:
            # Collect garbage first so the reported usage is accurate.
            gc.collect()
        _log_mem()
Example #2
0
 def format_size(self, role, size, diff):
     """Render *size* for the given Qt role.

     SORT_ROLE gets the raw integer (so sorting is numeric); tooltips
     are suppressed for small sizes (< 10 KiB); everything else falls
     back to tracemalloc's human-readable formatting.
     """
     if role == SORT_ROLE:
         return size
     if role == Qt.ToolTipRole:
         if abs(size) < 10 * 1024:
             return None
         return ("%+i" % size) if diff else str(size)
     return tracemalloc._format_size(size, diff)
Example #3
0
    def set_stats(self, snapshot1, snapshot2, group_by, cumulative):
        """Recompute the model's rows from one or two tracemalloc snapshots.

        snapshot1 only  -> plain statistics of snapshot1.
        both snapshots  -> snapshot2 compared against snapshot1 (diff view).
        snapshot1 None  -> empty model.

        Also rebuilds the column headers (extra Diff columns in diff mode)
        and brackets the update with Qt layout-change signals so attached
        views refresh.
        """
        self.emit(QtCore.SIGNAL("layoutAboutToBeChanged()"))

        if snapshot1 is not None:
            if snapshot2 is not None:
                stats = snapshot2.compare_to(snapshot1, group_by, cumulative)
            else:
                stats = snapshot1.statistics(group_by, cumulative)
            self.stats = stats
            # Diff mode is detected from the element type, not the arguments;
            # NOTE(review): assumes stats is non-empty here -- confirm callers.
            self.diff = isinstance(stats[0], tracemalloc.StatisticDiff)
            self.total = sum(stat.size for stat in self.stats)
            self.total_text = tracemalloc._format_size(self.total, False)
            if snapshot2 is not None:
                # Append the signed change in total traced size, computed
                # from the raw traces of the first snapshot.
                total1 = sum(trace.size for trace in snapshot1.traces)
                total2 = self.total
                self.total_text += ' (%s)' % tracemalloc._format_size(
                    total2 - total1, True)
        else:
            # No snapshot: reset to an empty state.
            self.stats = ()
            self.diff = False
            self.total = 0
            self.total_text = tracemalloc._format_size(0, False)

        self.group_by = group_by
        # First column label depends on how the statistics are grouped.
        if self.group_by == 'traceback':
            source = self.tr("Traceback")
        elif self.group_by == 'lineno':
            source = self.tr("Line")
        else:
            source = self.tr("Filename")
        self.headers = [source, self.tr("Size")]
        if self.diff:
            self.headers.append(self.tr("Size Diff"))
        self.headers.append(self.tr("Count"))
        if self.diff:
            self.headers.append(self.tr("Count Diff"))
        self.headers.extend([self.tr("Item Size"), self.tr("%Total")])

        self.emit(QtCore.SIGNAL("layoutChanged()"))
Example #4
0
def display():
    """Print the top 5 allocation sites from a fresh tracemalloc snapshot,
    each with its size and formatted traceback."""
    snapshot = t.take_snapshot()
    top_stats = snapshot.statistics("lineno", cumulative=False)
    for stat in top_stats[:5]:
        print("----------------------------------------")
        print(t._format_size(stat.size, False))
        for frame_line in stat.traceback.format():
            print(frame_line)
    print("========================================")
Example #5
0
    def get_label(self):
        """Return a display label for this snapshot file:
        "<basename> (<total size>, <N> traces, <timestamp>)".

        Lazily loads the snapshot the first time (to fill ntraces and
        total), then unloads it again to keep memory usage down.
        """
        if self.ntraces is None:
            print("Process snapshot %s..." % self.filename)
            # fill ntraces and total
            self.load()
            self.unload()
            print("Process snapshot %s... done" % self.filename)

        name = os.path.basename(self.filename)
        infos = [
            tracemalloc._format_size(self.total, False),
            # fmt/tr appear to be formatting/i18n helpers defined
            # elsewhere -- TODO confirm their exact contract.
            fmt(tr("%s traces"), self.ntraces),
            str(self.timestamp),
        ]
        return "%s (%s)" % (name, ', '.join(infos))
Example #6
0
def loop(*, size, times):
    """Run g(size) *times* times; after each run, print the traced memory
    and the top 3 allocation sites (noise frames filtered out)."""
    # Filters are loop-invariant, so build them once up front.
    noise = (
        t.Filter(False, "<frozen importlib._bootstrap>"),
        t.Filter(False, "*tracemalloc*"),
        t.Filter(False, "*linecache*"),
        t.Filter(False, "*sre_*"),
        t.Filter(False, "*re.py"),
        t.Filter(False, "*fnmatch*"),
        t.Filter(False, "*tokenize*"),
        t.Filter(False, "<unknown>"),
    )
    for _ in range(times):
        print(len(L))
        g(size)
        print([t._format_size(m, False) for m in t.get_traced_memory()])
        snapshot = t.take_snapshot().filter_traces(noise)

        for stat in snapshot.statistics("lineno", cumulative=False)[:3]:
            print("----------------------------------------")
            print(t._format_size(stat.size, False))
            for line in stat.traceback.format():
                print(line)
        print("========================================")
Example #7
0
import tracemalloc as t

# Before tracing starts, get_traced_memory() reports zeros.
print("*start")
current, peak = t.get_traced_memory()
print([t._format_size(current, False), t._format_size(peak, False)])
t.start()

# 100 lists of 10,000 ints each, so the tracer has something to see.
L = [list(range(10000)) for _ in range(100)]
print("*gen")
current, peak = t.get_traced_memory()
print([t._format_size(current, False), t._format_size(peak, False)])

snapshot = t.take_snapshot()
for stats in snapshot.statistics("traceback")[:3]:
    print(stats)

print("----------------------------------------")
snapshot = t.take_snapshot()
for stats in snapshot.statistics("lineno", cumulative=True)[:3]:
    print(stats)

t.stop()
# After stop() the counters are reset, so this reports zeros again.
current, peak = t.get_traced_memory()
print([t._format_size(current, False), t._format_size(peak, False)])
def mem_count():
    """Log current and peak traced memory, but only when MEM_DEBUG is on."""
    if not MEM_DEBUG:
        return
    current, peak = tracemalloc.get_traced_memory()
    logging.info("Memory: current: %s, peak: %s" % (
        tracemalloc._format_size(current, False),
        tracemalloc._format_size(peak, False)))
Example #9
0
import json
from collections import defaultdict, namedtuple
from pympler.asizeof import asizeof
from tracemalloc import _format_size

# Maps a (stripped, sorted) tuple of field names to a stable string id,
# assigned in order of first appearance ("0", "1", ...).
idmap = defaultdict(lambda: str(len(idmap)))


def on_pairs(pairs, named={}):
    """json object_pairs_hook: turn each JSON object into a compact namedtuple.

    Objects with the same key set (leading underscores stripped) share a
    single namedtuple class, cached in *named* -- the mutable default is a
    deliberate per-process cache, not a bug.

    Bug fix: the original sorted the field names but passed the values in
    document order, so values were bound to the wrong fields whenever the
    JSON keys were not already alphabetical.  Pairs are now sorted once so
    fields and values stay aligned.

    pairs: list of (key, value) tuples from the JSON parser.
    Returns an instance of the cached namedtuple class for this key set.
    """
    ordered = sorted(pairs, key=lambda p: p[0].lstrip("_"))
    fields = tuple(p[0].lstrip("_") for p in ordered)
    k = idmap[fields]
    if k not in named:
        named[k] = namedtuple("N" + k, " ".join(fields))
    return named[k](*(p[1] for p in ordered))


# Load the whole file through on_pairs so every JSON object becomes a
# shared-class namedtuple instead of a dict; the figures below record the
# measured memory/time cost for this variant.
filename = "citylots.json"  # input file is ~181 MiB on disk
with open(filename) as rf:
    d0 = json.load(rf, object_pairs_hook=on_pairs)

print(len(idmap))  # number of distinct field sets seen in the document
print(type(d0), _format_size(asizeof(d0), False))
# 4
# <class '__main__.N3'> 638 MiB

# real    1m45.924s
# user    1m38.756s
# sys     0m5.089
Example #10
0
import json
from tracemalloc import _format_size
from pympler.asizeof import asizeof

# Baseline measurement: load the same JSON into plain dicts/lists and
# report its deep in-memory footprint via pympler's asizeof.
filename = "citylots.json"  # input file is ~181 MiB on disk
with open(filename) as rf:
    d0 = json.load(rf)

print(type(d0), _format_size(asizeof(d0), False))
# <class 'dict'> 795 MiB

# real    1m31.358s
# user    1m25.421s
# sys     0m4.634s
Example #11
0
def loop(*, size, times):
    """Run g(size) *times* times, logging traced memory before each run."""
    for _ in range(times):
        usage = [t._format_size(m, False) for m in t.get_traced_memory()]
        logger.info("memory (current, peak) %s", str(usage))
        g(size)
Example #12
0
def main():
    """Command-line entry point.

    Parses arguments, derives the gram groups (n-gram sizes plus optional
    sentence/word/entity groups) and the (year, season) intervals, then
    runs generate_gram_list over them.
    """

    global NO_PUNCTUATION

    if LOG_MEM:
        # Start allocation tracing so the final memory report has data.
        tracemalloc.start()

    logging.info("Starting at %s " % datetime.now().strftime("%H:%M:%S"))
    parser = argparse.ArgumentParser(
        description='Breaks documents into n-grams under a variety of fitlers')
    # NOTE(review): "fitlers" typo in the user-visible help text above.
    parser.add_argument('--start',
                        dest="MIN",
                        default=3,
                        type=int,
                        help='Analyze n-grams with n>=start')
    parser.add_argument('--stop',
                        dest="MAX",
                        default=9,
                        type=int,
                        help='Analyze n-grams with n<=stop')
    parser.add_argument(dest="intervals",
                        type=str,
                        nargs='+',
                        help='Intervals to collect n-grams over')
    parser.add_argument('-s',
                        dest="sentences",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine sentences')
    parser.add_argument('-w',
                        dest="words",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine words')
    parser.add_argument('-e',
                        dest="entities",
                        action='store_const',
                        const=True,
                        default=False,
                        help='Examine entities')

    util.add_arguments(parser)

    args = parser.parse_args()

    #Arguments:
    #analytics.py <MIN> <MAX> <N> (sw)
    #Finds the top N n-grams for each n \in [MIN .. MAX]
    #"s" in the last argument indicates including sentences, "w" words. Blank for nothing
    start = args.MIN
    stop = args.MAX + 1  # --stop is inclusive; make it exclusive for range()
    yearseasons = args.intervals
    SENTENCES = args.sentences
    WORDS = args.words
    ENTITIES = args.entities

    util.process_arguments(args)

    NO_PUNCTUATION = util.NO_PUNCTUATION
    # NOTE(review): MERGE_SIMILAR, clean and np are assigned but never used
    # in this function -- possibly leftovers from an older version.
    MERGE_SIMILAR = util.MERGE_SIMILAR
    clean = "_CL" if util.USE_CLEAN else ""
    np = "_NP" if NO_PUNCTUATION else ""

    global stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # Placeholder tokens emitted by the cleaning pass count as stopwords too.
    cleaned_words = set(["_organization_", "_number_", "_url_", "_email_"])
    stopwords.update(cleaned_words)

    try:
        # NOTE(review): 'yearseason' is undefined at this point (it is only
        # bound inside the interval-parsing loop below), so this raises
        # NameError which the bare except silently swallows -- the directory
        # is never created.  Probably meant yearseasons[0], or one mkdir per
        # interval; confirm intent before fixing.
        os.mkdir("../data/%s/" % yearseason)
    except:
        pass

    # One group per n in [start, stop), plus optional special groups:
    # "s" sentences, "w" words, and "e"/"m"/"u" entity variants.
    gram_groups = [[n] for n in range(start, stop)]
    if SENTENCES:
        gram_groups.append(["s"])
    if WORDS:
        gram_groups.append(["w"])
    if ENTITIES:
        gram_groups.append(["e", "m", "u"])

    #Decide how much we're going to iterate
    if yearseasons[0] == "all":
        # Full rebuild: wipe previous gram data and cover every interval.
        logging.info("Removing old data at %s " %
                     datetime.now().strftime("%H:%M:%S"))
        ioutils.remove_grams()
        logging.info("Done removing old data at %s" %
                     datetime.now().strftime("%H:%M:%S"))
        intervals = [t for t in util.iter_year_season()]
    else:
        # Parse interval specs: "YYYYx" is a year plus a season letter;
        # a bare "YYYY" expands to both 'A' and 'B' seasons of that year.
        intervals = []
        for yearseason in yearseasons:
            year = int(yearseason[:4])
            if len(yearseason) == 5:
                season = yearseason[4]
                intervals.append((year, season))
            elif len(yearseason) == 4:
                intervals.append((year, 'A'))
                intervals.append((year, 'B'))
            else:
                logging.error("Error on %s\n" % yearseason)

    generate_gram_list(gram_groups, intervals)

    #logging.info("Closing DB at %s " % datetime.now().strftime("%H:%M:%S"))
    #ioutils.close_db()
    #logging.info("Finished at %s"  % datetime.now().strftime("%H:%M:%S"))
    if LOG_MEM:
        print("Max memory usage:")
        print("Current: %s, Peak: %s" % tuple(
            (tracemalloc._format_size(m, False)
             for m in tracemalloc.get_traced_memory())))
Example #13
0
def handle_traceback(sig, frame):
    """Signal handler: log traced memory, then dump a short stack trace."""
    import traceback
    usage = [t._format_size(m, False) for m in t.get_traced_memory()]
    logger.info("memory (current, peak) %s", str(usage))
    traceback.print_stack(limit=5)
Example #14
0
def tick():
    """Log current/peak traced memory five times a second, forever."""
    while True:
        usage = [t._format_size(m, False) for m in t.get_traced_memory()]
        logger.info("%s", str(usage))
        time.sleep(0.2)
Example #15
0
def tick():
    """Periodically (every 0.2 s) log current and peak traced memory."""
    while True:
        sizes = [t._format_size(n, False) for n in t.get_traced_memory()]
        logger.info("%s", str(sizes))
        time.sleep(0.2)
Example #16
0
 def update_event(self, inp=-1):
     """Format input 0 as a human-readable size (input 1 selects signed
     diff formatting) and push the result to output 0."""
     formatted = tracemalloc._format_size(self.input(0), self.input(1))
     self.set_output_val(0, formatted)
Example #17
0
def handle_traceback(sig, frame):
    """Signal handler: log traced memory and print the last five frames."""
    logger.info(
        "memory (current, peak) %s",
        str([t._format_size(v, False) for v in t.get_traced_memory()]))
    import traceback
    traceback.print_stack(limit=5)