Example #1
import subprocess


class PsParser:
    """Parse the output of ps into headers, rows, and a column-keyed dict."""

    def __init__(self, command="ps -Al"):
        output = subprocess.check_output(command, shell=True)
        self.result = {}
        processes = output.splitlines()
        # The last column (CMD) can contain spaces, so split each row into
        # at most nfields + 1 pieces.
        nfields = len(processes[0].split()) - 1
        self.headers = []
        self.lines = []
        for k, row in enumerate(processes):
            row = row.decode("utf-8")
            data = row.split(None, nfields)
            if k == 0:
                self.headers = data
                for header in data:
                    self.result[header] = []
                continue
            self.lines.append(data)
            for i, value in enumerate(data):
                self.result[self.headers[i]].append(value)

    def get_pid(self, pid):
        # Proc is assumed to be defined elsewhere in the same module.
        result = [Proc(k, self) for k in self.lines if int(k[self.headers.index("PID")]) == pid]
        return result[0] if result else None
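
A minimal usage sketch, assuming a trivial stand-in for the Proc class that the example references but does not define:

class Proc:
    """Hypothetical stand-in: wraps one parsed ps row."""

    def __init__(self, fields, parser):
        self.fields = fields
        self.parser = parser

    def __getitem__(self, column):
        return self.fields[self.parser.headers.index(column)]


parser = PsParser()
print(parser.headers)             # e.g. ['F', 'S', 'UID', 'PID', ...]
print(parser.result["PID"][:5])   # first five PIDs, as strings
init = parser.get_pid(1)
if init is not None:
    print(init["CMD"])            # command line of PID 1
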
Example #2
import csv
from collections import OrderedDict

from django.http import HttpResponse
from django.utils.encoding import smart_str


def export_silo(request, id):
    # Silo and LabelValueStore are application models assumed to be
    # imported elsewhere in this module.
    silo_name = Silo.objects.get(id=id).name

    response = HttpResponse(content_type="text/csv")
    response["Content-Disposition"] = 'attachment; filename="%s.csv"' % silo_name
    writer = csv.writer(response)

    silo_data = LabelValueStore.objects(silo_id=id)
    data = []
    num_cols = 0
    cols = OrderedDict()
    if silo_data:
        num_rows = len(silo_data)

        # Build the union of column names across all rows; OrderedDict
        # preserves the order in which columns are first seen.
        for row in silo_data:
            for col in row:
                if col not in cols:
                    num_cols += 1
                    cols[col] = num_cols

        # Convert the OrderedDict keys to a Python list so the header row
        # can be written and column positions looked up with .index().
        cols = list(cols)
        writer.writerow(cols)

        # Populate a num_rows x num_cols 2D list matching the shape of silo_data.
        for _ in range(num_rows):
            data += [[0] * num_cols]

        for r, row in enumerate(silo_data):
            for col in row:
                # Map values to column names and place them in the correct
                # position in the data array.
                data[r][cols.index(col)] = smart_str(row[col])
            writer.writerow(data[r])
    return response
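
The core of this view is independent of Django: collect the union of keys across heterogeneous row dicts, then emit a rectangular CSV with a filler value for missing cells. A standalone sketch of that idea with made-up rows; storing each column's position as the dict value avoids the repeated .index() scans used above:

import csv
import io
from collections import OrderedDict

rows = [
    OrderedDict([("name", "a"), ("count", 1)]),
    OrderedDict([("name", "b"), ("extra", "x")]),
]

# Union of column names, in first-seen order, mapped to their positions.
cols = OrderedDict()
for row in rows:
    for col in row:
        cols.setdefault(col, len(cols))

buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow(list(cols))
for row in rows:
    line = [""] * len(cols)
    for col, val in row.items():
        line[cols[col]] = val
    writer.writerow(line)

print(buf.getvalue())
# name,count,extra
# a,1,
# b,,x
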
Example #3
    # Requires "from collections import OrderedDict" at module level.
    def ranking(self):
        with open(self.wordset_a) as fh:
            content_a = [word.strip() for word in fh]
        with open(self.wordset_b) as fh:
            content_b = [word.strip() for word in fh]

        result_matrix = self.result_matrix.todense()
        truth_matrix = self.truth_matrix.todense()
        row = 0
        targets = []
        rankings = []
        result_word_list = []
        truth_word_list = []

        for i in content_a:
            targets.append(i)
            column = 0
            result_dict = {}
            truth_dict = {}

            # Score every word in wordset_b against the current target word.
            for j in content_b:
                result_dict[str(j)] = result_matrix[row, column]
                truth_dict[str(j)] = truth_matrix[row, column]
                column += 1

            # Sort words by score, highest first; list() makes the keys
            # indexable and searchable (required on Python 3).
            result_sort = list(OrderedDict(reversed(sorted(result_dict.items(), key=lambda t: float(t[1])))).keys())
            truth_sort = list(OrderedDict(reversed(sorted(truth_dict.items(), key=lambda t: float(t[1])))).keys())

            result_words = []
            truth_words = []
            iteration = 0
            rank_count = 0
            tr_rank = 0

            for l in range(10):
                result_words.append(result_sort[l])
                truth_words.append(truth_sort[l])
                # 1-based position of the l-th true word in the predicted ranking.
                rank_count += result_sort.index(truth_sort[l]) + 1
                iteration += 1
                tr_rank += iteration

            rank = rank_count / 10.0
            # Mean rank of a perfect prediction: (1 + 2 + ... + 10) / 10 = 5.5.
            reference = tr_rank / 10.0
            result_word_list.append(result_words)
            truth_word_list.append(truth_words)
            rankings.append(rank)

            row += 1

        avg_rank = sum(rankings) / len(rankings)

        return reference, avg_rank, rankings, result_word_list, truth_word_list, targets
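
The score being computed is the average 1-based position that the top-10 truth words occupy in the predicted ordering, so the returned reference value of 5.5 is the best achievable score. A toy illustration of the same computation:

predicted = ["cat", "dog", "fish", "bird", "cow"]
truth = ["dog", "bird", "cat", "cow", "fish"]

k = 3
rank_sum = sum(predicted.index(word) + 1 for word in truth[:k])
mean_rank = rank_sum / float(k)
best_possible = sum(range(1, k + 1)) / float(k)  # perfect agreement on the top k

print(mean_rank, best_possible)  # 2.333... 2.0
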
Example #4
import csv
import json
import time
from collections import OrderedDict

# dumps() is assumed to come from pymongo's bson helpers.
from bson.json_util import dumps
from django.http import HttpResponse
from django.utils.encoding import smart_text


def export_silo(request, id):
    # Python 2 code: Silo and the Mongo collection handle "store" are
    # assumed to be defined elsewhere in this module.
    silo_name = Silo.objects.get(id=id).name

    response = HttpResponse(content_type="text/csv")
    response["Content-Disposition"] = 'attachment; filename="%s.csv"' % silo_name
    writer = csv.writer(response)

    # Load the BSON objects from Mongo.
    bsondata = store.find({"silo_id": int(id)})
    # Convert BSON to a JSON string, then decode it back into Python
    # objects, using OrderedDict to maintain field order.
    json_string = dumps(bsondata)
    silo_data = json.loads(json_string, object_pairs_hook=OrderedDict)
    data = []
    num_cols = 0
    cols = OrderedDict()
    if silo_data:
        num_rows = len(silo_data)

        # Build the union of column names across all rows. Encode each name
        # to UTF-8 bytes so the membership test, the stored key, and the
        # position lookup below all use the same representation.
        for row in silo_data:
            for col in row:
                col = col.encode("utf-8")
                if col not in cols:
                    num_cols += 1
                    cols[col] = num_cols

        # Convert the OrderedDict keys to a Python list so the header row
        # can be written and column positions looked up with .index().
        cols = list(cols)
        writer.writerow(cols)

        # Populate a num_rows x num_cols 2D list matching the shape of silo_data.
        for _ in range(num_rows):
            data += [[0] * num_cols]

        for r, row in enumerate(silo_data):
            for col in row:
                # Map values to column names and place them in the correct
                # position in the data array.
                val = row[col]
                if isinstance(val, OrderedDict):
                    # bson.json_util encodes dates and ObjectIds as
                    # {"$date": ms} / {"$oid": hex}; popitem() turns those
                    # into ("$date", ms) / ("$oid", hex) tuples.
                    val = val.popitem()
                if isinstance(val, tuple):
                    if val[0] == "$date":
                        val = smart_text(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(val[1] / 1000)))
                    if val[0] == "$oid":
                        val = smart_text(val[1])
                val = smart_text(val).encode("utf-8")
                data[r][cols.index(col.encode("utf-8"))] = val
            writer.writerow(data[r])
    return response
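
The value normalization above is the interesting part: pymongo's extended JSON wraps dates and ObjectIds in single-key objects. A standalone sketch of the same unwrapping, using a hypothetical helper name:

import json
import time
from collections import OrderedDict

def flatten_extended_json(val):
    """Hypothetical helper: collapse {"$date": ...} / {"$oid": ...} wrappers."""
    if isinstance(val, OrderedDict):
        val = val.popitem()  # -> ("$date", 1466000000000) or ("$oid", "57...")
    if isinstance(val, tuple):
        if val[0] == "$date":
            return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(val[1] / 1000))
        if val[0] == "$oid":
            return val[1]
    return val

raw = json.loads('{"when": {"$date": 1466000000000}, "n": 3}', object_pairs_hook=OrderedDict)
print({k: flatten_extended_json(v) for k, v in raw.items()})
# {'when': '2016-06-15 ...', 'n': 3}  (exact timestamp depends on local timezone)
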
Example #5
import h5py
from collections import OrderedDict


def rank(Input1, Input2, D, R):

    f = h5py.File("all_data", "w")
    with open(Input1) as fh:
        content = [word.strip() for word in fh]
    with open(Input2) as fh:
        test_content = [word.strip() for word in fh]

    k = 0
    ResultMatrix = R.todense()
    TruthMatrix = D.todense()
    ranking = []
    for i in content:
        l = 0
        d = {}
        e = {}
        rc = 0
        tr = 0
        step = 0
        # print(i)
        # print("\t Truth \t\t Calculation")
        # print("\t________________________________")

        # Score every test word against the current target word.
        for j in test_content:
            d[str(j)] = ResultMatrix[k, l]
            e[str(j)] = TruthMatrix[k, l]
            l += 1

        # Sort words by score, highest first; list() makes the keys
        # indexable and searchable (required on Python 3).
        C = list(OrderedDict(reversed(sorted(d.items(), key=lambda t: float(t[1])))).keys())
        T = list(OrderedDict(reversed(sorted(e.items(), key=lambda t: float(t[1])))).keys())
        tar_words = []

        for m in range(0, 5):
            print("\t", T[m], "\t\t", C[m])
            tar_words.append(C[m])
            # 1-based position of the m-th true word in the computed ranking.
            rc += C.index(T[m]) + 1
            step += 1
            tr += step

        # Mean rank over the top five, appended once per target word (the
        # original appended inside the loop, inflating the list fivefold).
        list_rank = rc / 5.0
        ranking.append(list_rank)
        print("\t_________________________________")
        print("\t", tr, "\t\t", rc, "\t", list_rank)
        k += 1

        label_rank = i + "_rank"
        label_words = i + "_words"

        f.create_dataset(label_rank, data=list_rank)
        f.create_dataset(label_words, data=tar_words)

    # The original divided by a hardcoded 337 (its word count).
    avg_rank = sum(ranking) / float(len(ranking))
    label_avg_rank = "_avg_rank"
    f.create_dataset(label_avg_rank, data=avg_rank)
    print("Average Rank", avg_rank)
    f.close()
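
A minimal sketch of reading the results back, assuming rank() has produced the all_data file as above:

import h5py

with h5py.File("all_data", "r") as f:
    print(f["_avg_rank"][()])  # the scalar average rank
    # per-word datasets follow the "<word>_rank" / "<word>_words" pattern
    for name in sorted(f)[:5]:
        print(name, f[name][()])
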
Example #6
def pmultiquery(
    corpus,
    search,
    show="words",
    query="any",
    sort_by="total",
    quicksave=False,
    multiprocess="default",
    function_filter=False,
    just_speakers=False,
    root=False,
    note=False,
    print_info=True,
    **kwargs
):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator() if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) just_speakers == 'each', or a list of speakers with len(list) > 1
    
    This function needs joblib 0.8.4 or above in order to run properly.
    There's no reason to call it yourself."""

    import collections
    import os
    import pandas as pd
    from collections import namedtuple
    from time import strftime, localtime
    import corpkit
    from interrogator import interrogator
    from editor import editor
    from other import save
    from interrogation import Interrogation

    try:
        from joblib import Parallel, delayed
    except ImportError:
        # joblib is only needed for the parallel branch below; install it
        # with: pip install joblib
        pass
    import multiprocessing

    def best_num_parallel(num_cores, num_queries):
        """Decide how many parallel processes to run.

        The idea, more or less, is to balance the load when possible."""
        import corpkit

        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores])
                except ValueError:
                    return num_cores
            else:
                import math

                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores:
                        return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, "__iter__"):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif isinstance(query, (list, dict)) and not hasattr(search, "__iter__"):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    elif hasattr(search, "__iter__") and not isinstance(search, dict):
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))
    elif hasattr(function_filter, "__iter__"):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(list(function_filter.keys())))
        denom = len(list(function_filter.keys()))
    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus

        multiple_speakers = True
        if just_speakers == "each" or just_speakers == ["each"]:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print("No speaker name data found.")
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    if type(multiprocess) == int:
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError("quicksave must be string when using pmultiquery.")

    # the options that don't change
    d = {
        #'paralleling': True,
        "function": "interrogator",
        "root": root,
        "note": note,
        "denominator": denom,
    }

    # add kwargs to the shared options dict
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict["corpus"] = p
            a_dict["search"] = search
            a_dict["query"] = query
            a_dict["show"] = show
            a_dict["outname"] = name.replace("-parsed", "")
            a_dict["just_speakers"] = just_speakers
            a_dict["paralleling"] = index
            a_dict["printstatus"] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict["corpus"] = corpus
            a_dict["search"] = search
            a_dict["query"] = q
            a_dict["show"] = show
            a_dict["outname"] = name
            a_dict["just_speakers"] = just_speakers
            a_dict["paralleling"] = index
            a_dict["printstatus"] = False
            ds.append(a_dict)
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            a_dict = dict(d)
            a_dict["corpus"] = corpus
            a_dict["search"] = search
            a_dict["query"] = query
            a_dict["show"] = show
            a_dict["outname"] = name
            a_dict["just_speakers"] = just_speakers
            a_dict["paralleling"] = index
            a_dict["function_filter"] = q
            a_dict["printstatus"] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict["corpus"] = corpus
            a_dict["search"] = search
            a_dict["query"] = query
            a_dict["show"] = show
            a_dict["outname"] = name
            a_dict["just_speakers"] = [name]
            a_dict["function_filter"] = function_filter
            a_dict["paralleling"] = index
            a_dict["printstatus"] = False
            ds.append(a_dict)
    elif multiple_search:
        for index, val in enumerate(search):
            a_dict = dict(d)
            a_dict["corpus"] = corpus
            a_dict["search"] = val
            a_dict["query"] = query
            a_dict["show"] = show
            # the original referenced an undefined "name" here; the search
            # value itself is the only sensible label in this branch
            a_dict["outname"] = str(val)
            a_dict["just_speakers"] = just_speakers
            a_dict["function_filter"] = function_filter
            a_dict["paralleling"] = index
            a_dict["printstatus"] = False
            ds.append(a_dict)

    do_conc = kwargs.get("do_concordancing")
    if do_conc is False:
        message = "Interrogating"
    elif do_conc is True:
        message = "Interrogating and concordancing"
    elif str(do_conc).lower() == "only":
        message = "Concordancing"
    else:
        # covers the case where do_concordancing was not passed at all
        message = "Interrogating"
    time = strftime("%H:%M:%S", localtime())
    sformat = ""
    for i, (k, v) in enumerate(list(search.items())):
        if type(v) == list:
            vformat = ", ".join(v[:5])
            if len(v) > 5:
                vformat += " ..."
        else:
            vformat = v
        sformat += "%s: %s" % (k, vformat)
        if i < len(search.keys()) - 1:
            sformat += "\n                  "

    if multiple_corpora and not multiple_option:
        print(
            (
                "\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n              %s"
                "\n          Query: '%s'\n          %s corpus ... \n"
                % (time, len(corpus), num_cores, "\n              ".join([i.name for i in corpus]), sformat, message)
            )
        )

    elif multiple_queries:
        # note: counts and labels here come from the query dict, which is
        # the iterable in this branch (the original read them from search)
        print(
            (
                "\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s"
                "\n          Queries: '%s'\n          %s corpus ... \n"
                % (time, len(query), num_cores, corpus.name, "', '".join(str(q) for q in query.values()), message)
            )
        )

    elif multiple_search:
        print(
            (
                "\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s"
                "\n          Queries: '%s'\n          %s corpus ... \n"
                % (time, len(list(search.keys())), num_cores, corpus.name, str(list(search.values())), message)
            )
        )

    elif multiple_option:
        print(
            (
                "\n%s: Beginning %d parallel corpus interrogations (multiple options): %s"
                "\n          Query: '%s'\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message)
            )
        )

    elif multiple_speakers:
        print(
            (
                "\n%s: Beginning %d parallel corpus interrogations: %s"
                "\n          Query: '%s'\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message)
            )
        )

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    failed = False
    terminal = False
    used_joblib = False
    if not root:
        from blessings import Terminal

        terminal = Terminal()
        print("\n" * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj["paralleling"]
            thetime = strftime("%H:%M:%S", localtime())
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    print("%s: QUEUED: %s" % (thetime, dobj["outname"]))
            except:
                pass

    if not root and multiprocess:
        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print("Multiprocessing failed.")
            raise
        if not res:
            failed = True
    else:
        res = []
        for index, d in enumerate(ds):
            d["startnum"] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted(res)
        except:
            pass

    import corpkit
    from interrogation import Concordance

    if kwargs.get("do_concordancing") == "only":
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        print("\n\n%s: Finished! %d results.\n\n" % (thetime, len(concs.index)))
        return Concordance(concs)

    from collections import OrderedDict

    if not all(isinstance(i.results, pd.Series) for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ["note", "root"]:
                interrog.query.pop(unpicklable, None)
            out[interrog.query["outname"]] = interrog

        if quicksave:
            fullpath = os.path.join("saved_interrogations", quicksave)
            while os.path.isdir(fullpath):
                selection = input(
                    "\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: "
                    % (quicksave, "saved_interrogations")
                )
                if selection == "o" or selection == "O":
                    import shutil

                    shutil.rmtree(fullpath)
                else:
                    import os

                    fullpath = os.path.join("saved_interrogations", selection)

            for k, v in list(out.items()):
                save(v, k, savedir=fullpath, print_info=False)

            time = strftime("%H:%M:%S", localtime())
            print("\n%s: %d files saved to %s" % (time, len(list(out.keys())), fullpath))

        time = strftime("%H:%M:%S", localtime())
        print(
            "\n\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n"
            % (time, "'\n         '".join(sorted(out.keys())))
        )
        from interrogation import Interrodict

        return Interrodict(out)
    # make query and total branch, save, return
    else:
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = pd.DataFrame(sers, index=[i.query["outname"] for i in res])
            out = out.reindex(sorted(out.columns), axis=1)  # sort cols
            out = out.fillna(0)  # nan to zero
            out = out.astype(int)  # float to int
            out = out.T
        else:
            out = pd.concat([r.results for r in res], axis=1)
            # format like normal
            out = out[sorted(list(out.columns))]
            out = out.T
            out = out.fillna(0)  # nan to zero
            out = out.astype(int)
            if "c" in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort by total (.loc and label-based selection replace the
        # long-deprecated .ix / positional indexing in the original)
        if isinstance(out, pd.DataFrame):
            out.loc["Total-tmp"] = out.sum()
            tot = out.loc["Total-tmp"]
            out = out[tot.sort_values(ascending=False).index]
            out = out.drop("Total-tmp", axis=0)
        out = out.edit(sort_by=sort_by, print_info=False, keep_stats=False, df1_always_df=kwargs.get("df1_always_df"))
        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()
        if kwargs.get("do_concordancing") is True:
            concs = pd.concat([x.concordance for x in res], ignore_index=True)
            concs = concs.sort_values(by="c")
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal:
            with terminal.location(0, terminal.height):
                print(
                    "\n\n%s: Finished! %d unique results, %d total.%s"
                    % (thetime, len(out.results.columns), out.totals.sum(), "\n")
                )
        else:
            print(
                "\n\n%s: Finished! %d unique results, %d total.%s"
                % (thetime, len(out.results.columns), out.totals.sum(), "\n")
            )
        if used_joblib:
            print("\n" * (len(ds) - 3))
        if quicksave:
            from other import save

            save(out, quicksave)
        return out
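
The load-balancing heuristic in best_num_parallel is easy to probe on its own. A quick standalone check of what it returns for a few (cores, queries) combinations, assuming the function has been lifted out of pmultiquery unchanged:

for cores, queries in [(4, 3), (4, 16), (4, 8), (4, 25), (4, 10)]:
    print(cores, queries, best_num_parallel(cores, queries))
# 4 3 3     fewer queries than cores: one process per query
# 4 16 4    16 / 4 == 4, so the cores divide the queries evenly
# 4 8 4     8 % 4 == 0: the largest n with 8/n <= 4 wins
# 4 25 5    25 is a perfect square, so sqrt(25) processes of 5 queries each
# 4 10 4    nothing divides evenly; fall back to all available cores
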