Example #1
0
    def testSeriesNested(self):
        s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15])
        s.sort()

        nested = {'s1': s, 's2': s.copy()}

        exp = {'s1': ujson.decode(ujson.encode(s)),
               's2': ujson.decode(ujson.encode(s))}
        self.assertTrue(ujson.decode(ujson.encode(nested)) == exp)

        exp = {'s1': ujson.decode(ujson.encode(s, orient="split")),
               's2': ujson.decode(ujson.encode(s, orient="split"))}
        self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp)

        exp = {'s1': ujson.decode(ujson.encode(s, orient="records")),
               's2': ujson.decode(ujson.encode(s, orient="records"))}
        self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp)

        exp = {'s1': ujson.decode(ujson.encode(s, orient="values")),
               's2': ujson.decode(ujson.encode(s, orient="values"))}
        self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp)

        exp = {'s1': ujson.decode(ujson.encode(s, orient="index")),
               's2': ujson.decode(ujson.encode(s, orient="index"))}
        self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp)
Example #2
0
def hamm_entropy_kNN(X,y,kNN=3):
    d1 = hamm_entropy_Xy(X,y)
    d1 = Series(d1)
    d1.sort(ascending=True)
    ixs = d1.index
    d1_kNN = d1[0:kNN]
    ixs_kNN = ixs[0:kNN]
    return (np.array(d1_kNN), ixs_kNN)
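On current pandas the k-nearest selection above reads more directly with nsmallest; a sketch, assuming hamm_entropy_Xy (defined elsewhere in the original module) returns per-sample distances:

import numpy as np
from pandas import Series

def hamm_entropy_kNN(X, y, kNN=3):
    d1 = Series(hamm_entropy_Xy(X, y))   # hamm_entropy_Xy comes from the original module
    d1_kNN = d1.nsmallest(kNN)           # the kNN smallest distances, in ascending order
    return np.array(d1_kNN), d1_kNN.index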
Example #3
0
def Main():
    client = github_helpers.authenticate()
    keywords = raw_input("Please, enter keywords to search repositories: ")
    if keywords == '':
        keywords = 'javascript'
        print 'No keywords provided. It will use the keyword: ' + keywords
    search = client.search_repositories(keywords)
    first_page = search.get_page(0)

    languages = Series(r.language for r in first_page)
    languages = languages.dropna()
    languages.sort()

    percentages = (100.0 * languages.value_counts() / len(languages)).map(
        '{:,.2f} %'.format)

    print 'Languages percentage:'
    print percentages

    # Create plot
    x = [int(r.stargazers_count) for r in first_page]
    y = [int(r.forks) for r in first_page]

    # Add one to every value for logarithmic scale
    x = [val + 1 for val in x]
    y = [val + 1 for val in y]

    area = [100 for r in first_page]
    names = [r.name for r in first_page]
    colors = np.random.rand(len(first_page))
    pl.scatter(x, y, s=area, c=colors, alpha=0.5)
    for i in range(0, len(x)):
        pl.annotate(names[i], (x[i], y[i]), fontsize=2)
    pl.title("All values are with addition of 1 (for the logarithmic scale)")
    pl.xlabel("Stars")
    pl.xscale("log")
    pl.yscale("log")
    pl.ylabel("Forks")
    pl.tight_layout()
    filepath = 'reports/APIs/github'
    if not os.path.isdir(filepath): os.makedirs(filepath)
    filepath += '/search_repositories.png'
    pl.savefig(filepath, figsize=(1020, 1020), dpi=300)
    pl.close()
    print(
        'A chart with high resolution and small font size (to minimize overlaps) was created at '
        + filepath)
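The language-percentage step can also lean on value_counts(normalize=True); a minimal sketch with a stand-in list of languages:

import pandas as pd

languages = pd.Series(['JavaScript', 'Python', 'JavaScript', None]).dropna()
percentages = (100.0 * languages.value_counts(normalize=True)).map('{:,.2f} %'.format)
print(percentages)             # e.g. JavaScript 66.67 %, Python 33.33 %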
Example #4
0
    def test_ix_align(self):
        from pandas import Series
        b = Series(np.random.randn(10))
        b.sort()
        df_orig = Panel(np.random.randn(3, 10, 2))
        df = df_orig.copy()

        df.ix[0, :, 0] = b
        assert_series_equal(df.ix[0, :, 0].reindex(b.index), b)

        df = df_orig.swapaxes(0, 1)
        df.ix[:, 0, 0] = b
        assert_series_equal(df.ix[:, 0, 0].reindex(b.index), b)

        df = df_orig.swapaxes(1, 2)
        df.ix[0, 0, :] = b
        assert_series_equal(df.ix[0, 0, :].reindex(b.index), b)
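Panel and the .ix indexer have since been removed from pandas; the index-alignment behaviour the test exercises can be sketched with a DataFrame and .loc instead:

import numpy as np
import pandas as pd

b = pd.Series(np.random.randn(10)).sort_values()
df = pd.DataFrame(np.random.randn(10, 2))
df.loc[:, 0] = b                              # assignment aligns b on the row index
assert df[0].reindex(b.index).equals(b)       # same values, back in b's order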
Example #5
0
def main():
    import shutil
    import tempfile
    import warnings

    from pandas import Series

    from vbench.api import BenchmarkRunner
    from suite import (REPO_PATH, BUILD, DB_PATH, PREPARE, dependencies,
                       benchmarks)

    from memory_profiler import memory_usage

    warnings.filterwarnings('ignore', category=FutureWarning)

    try:
        TMP_DIR = tempfile.mkdtemp()
        runner = BenchmarkRunner(
            benchmarks,
            REPO_PATH,
            REPO_PATH,
            BUILD,
            DB_PATH,
            TMP_DIR,
            PREPARE,
            always_clean=True,
            # run_option='eod', start_date=START_DATE,
            module_dependencies=dependencies)
        results = {}
        for b in runner.benchmarks:
            k = b.name
            try:
                vs = memory_usage((b.run, ))
                v = max(vs)
                # print(k, v)
                results[k] = v
            except Exception as e:
                print("Exception caught in %s\n" % k)
                print(str(e))

        s = Series(results)
        s.sort()
        print(s)

    finally:
        shutil.rmtree(TMP_DIR)
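On current pandas the final report in this benchmark driver would use sort_values; a sketch of just that reporting step, with a hypothetical results dict standing in for the measured peaks:

from pandas import Series

results = {'frame_ctor': 120.5, 'groupby_sum': 98.2}   # hypothetical name -> peak memory (MiB)
s = Series(results).sort_values()
print(s)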
Example #6
0
def Main():
  client = github_helpers.authenticate()
  keywords = raw_input("Please, enter keywords to search repositories: ")
  if keywords == '':
    keywords = 'javascript'
    print 'No keywords provided. It will use the keyword: ' + keywords
  search = client.search_repositories(keywords)
  first_page = search.get_page(0)

  languages = Series(r.language for r in first_page)
  languages = languages.dropna()
  languages.sort()

  percentages = (100.0 * languages.value_counts() / len(languages)).map('{:,.2f} %'.format)

  print 'Languages percentage:'
  print percentages

  # Create plot
  x = [int(r.stargazers_count) for r in first_page]
  y = [int(r.forks) for r in first_page]

  # Add one to every value for logarithmic scale
  x = [val + 1 for val in x]
  y = [val + 1 for val in y]

  area = [100 for r in first_page]
  names = [r.name for r in first_page]
  colors = np.random.rand(len(first_page))
  pl.scatter(x, y, s=area, c=colors, alpha=0.5)
  for i in range(0, len(x)):
    pl.annotate(names[i], (x[i], y[i]), fontsize=2)
  pl.title("All values are with addition of 1 (for the logarithmic scale)")
  pl.xlabel("Stars")
  pl.xscale("log")
  pl.yscale("log")
  pl.ylabel("Forks")
  pl.tight_layout()
  filepath = 'reports/APIs/github'
  if not os.path.isdir(filepath): os.makedirs(filepath)
  filepath += '/search_repositories.png'
  pl.savefig(filepath, figsize=(1020, 1020), dpi=300)
  pl.close()
  print('A chart with high resolution and small font size (to minimize overlaps) was created at ' +
    filepath)
Example #7
0
    def get_same_cluster_articles(self, user_id, label, topn=3):
        log_collection = self.conn.get_collection('article_read_log')

        ls = log_collection.group({'article_id': True}, {
            'label': str(label),
            'user_id': {
                '$ne': user_id
            }
        }, {'count': 0}, 'function(obj, prev) {prev.count++}')

        ls_conv = {'article_id': [], 'count': []}
        for item in ls:
            ls_conv['article_id'].append(item['article_id'])
            ls_conv['count'].append(item['count'])
        s = Series(index=ls_conv['article_id'], data=ls_conv['count'])

        s.sort(ascending=False)  # sorting
        return s.keys()[:topn]
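The top-n selection above reads more directly with nlargest on current pandas; a sketch, assuming the same shape of ls_conv dict of article ids and counts:

from pandas import Series

# stand-in for the ls_conv dict built from the Mongo group() result
ls_conv = {'article_id': ['a1', 'a2', 'a3', 'a4'], 'count': [5, 9, 2, 7]}
topn = 3
s = Series(data=ls_conv['count'], index=ls_conv['article_id'])
top_ids = s.nlargest(topn).index.tolist()     # ids of the topn most-read articles: ['a2', 'a4', 'a1']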
Example #8
0
    def get_same_cluster_articles(self, user_id, label, topn=3):
        log_collection = self.conn.get_collection('article_read_log')

        ls = log_collection.group(
            {'article_id': True},
            {'label' : str(label), 'user_id' : {'$ne' : user_id}},
            {'count' : 0},
            'function(obj, prev) {prev.count++}'
        )

        ls_conv = {'article_id' :[], 'count': []}
        for item in ls:
            ls_conv['article_id'].append(item['article_id'])
            ls_conv['count'].append(item['count'])
        s = Series(index=ls_conv['article_id'], data=ls_conv['count'])

        s.sort(ascending=False) # sorting
        return s.keys()[:topn]
Example #9
0
    def testSeries(self):
        s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15])
        s.sort()

        # column indexed
        outp = Series(ujson.decode(ujson.encode(s)))
        outp.sort()
        self.assertTrue((s == outp).values.all())

        outp = Series(ujson.decode(ujson.encode(s), numpy=True))
        outp.sort()
        self.assertTrue((s == outp).values.all())

        dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split")))
        outp = Series(**dec)
        self.assertTrue((s == outp).values.all())
        self.assertTrue(s.name == outp.name)

        dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"),
                          numpy=True))
        outp = Series(**dec)
        self.assertTrue((s == outp).values.all())
        self.assertTrue(s.name == outp.name)

        outp = Series(ujson.decode(ujson.encode(s, orient="records"), numpy=True))
        self.assertTrue((s == outp).values.all())

        outp = Series(ujson.decode(ujson.encode(s, orient="records")))
        self.assertTrue((s == outp).values.all())

        outp = Series(ujson.decode(ujson.encode(s, orient="values"), numpy=True))
        self.assertTrue((s == outp).values.all())

        outp = Series(ujson.decode(ujson.encode(s, orient="values")))
        self.assertTrue((s == outp).values.all())

        outp = Series(ujson.decode(ujson.encode(s, orient="index")))
        outp.sort()
        self.assertTrue((s == outp).values.all())

        outp = Series(ujson.decode(ujson.encode(s, orient="index"), numpy=True))
        outp.sort()
        self.assertTrue((s == outp).values.all())
Example #10
0
def main():
    import shutil
    import tempfile
    import warnings

    from pandas import Series

    from vbench.api import BenchmarkRunner
    from suite import (REPO_PATH, BUILD, DB_PATH, PREPARE,
                       dependencies, benchmarks)

    from memory_profiler import memory_usage

    warnings.filterwarnings('ignore', category=FutureWarning)

    try:
        TMP_DIR = tempfile.mkdtemp()
        runner = BenchmarkRunner(
            benchmarks, REPO_PATH, REPO_PATH, BUILD, DB_PATH,
            TMP_DIR, PREPARE, always_clean=True,
            # run_option='eod', start_date=START_DATE,
            module_dependencies=dependencies)
        results = {}
        for b in runner.benchmarks:
            k = b.name
            try:
                vs = memory_usage((b.run,))
                v = max(vs)
                # print(k, v)
                results[k] = v
            except Exception as e:
                print("Exception caught in %s\n" % k)
                print(str(e))

        s = Series(results)
        s.sort()
        print(s)

    finally:
        shutil.rmtree(TMP_DIR)
Example #11
0
    def visualize_tree(tree, feature_names, labelnames, filename):
        """Create a tree PDF using graphviz.

        Args
        ----
        tree -- scikit-learn DecisionTree.
        feature_names -- list of feature names.
        """
        labels = Series(labelnames.values.ravel()).unique()
        labels.sort()
        labels = map(str, labels)
        # labels = labelnames.unique()
        # print labels
        with open(filename + ".dot", 'w') as f:
            export_graphviz(tree.dt, out_file=f,
                            feature_names=feature_names, class_names=labels)

        command = ["dot", "-Tpdf", filename + ".dot", "-o", filename + ".pdf"]
        try:
            subprocess.check_call(command)
        except:
            exit("Could not run dot, ie graphviz, to "
                 "produce visualization")
Example #12
0
    def visualize_tree(tree, feature_names, labelnames, filename):
        """Create a tree PDF using graphviz.

        Args
        ----
        tree -- scikit-learn DecisionTree.
        feature_names -- list of feature names.
        """
        labels = Series(labelnames.values.ravel()).unique()
        labels.sort()
        labels = map(str, labels)
        # labels = labelnames.unique()
        # print labels
        with open(filename + ".dot", 'w') as f:
            export_graphviz(tree.dt,
                            out_file=f,
                            feature_names=feature_names,
                            class_names=labels)

        command = ["dot", "-Tpdf", filename + ".dot", "-o", filename + ".pdf"]
        try:
            subprocess.check_call(command)
        except:
            exit("Could not run dot, ie graphviz, to " "produce visualization")
Example #13
0
nonalphabet = re.compile("[^a-z]")

with open('words.txt', 'r') as f:
    lines = f.readlines()

trigrams = {}
for line in lines:
    trigram = line.strip().lower()[0:3]
    if len(trigram) >= 3 and not nonalphabet.search(trigram):
        if trigram == "aaa":
            print "line: {0} trigram: {1}".format(line, trigram)
        trigrams.setdefault(trigram, 0)
        trigrams[trigram] += 1

trigram_series = Series(trigrams.values(), index=trigrams.keys())
trigram_series.sort(inplace=True, ascending=True)
print trigram_series
print "quartiles:\n{0}".format(trigram_series.quantile([.25, .50, .75, .99]).to_string())

print "median is: {0}".format(trigram_series.median())
unique_trigrams = []
for trigram, count in trigrams.iteritems():
    if count > trigram_series.quantile(.50):
        unique_trigrams.append(trigram)

print "saving trigrams"
with open("trigrams.json", "w") as f:
    json.dump(unique_trigrams, f)
print "saved {0} trigrams".format(len(unique_trigrams))
Example #14
0
import matplotlib.pyplot as pl
ind = np.arange(len(egs))
fig = pl.figure(1, figsize=(9, 4))
ax = fig.add_subplot(111)
ax.bar(ind,egs)
pl.xticks(ind, ixs, rotation=90)
ax.set_title('Entropy Gain')

# Using a tree for feature importance
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)
z_tmp = Series(clf.feature_importances_, X_all_aug.columns)

z_tmp.sort()
ixs = z_tmp.index

ind = np.arange(len(e_gain))
pl.figure(figsize=(9, 4))
pl.bar(ind,e_gain)
pl.xticks(ind, ixs, rotation=90)

df = DataFrame({'e_gain': e_gain, 'import': z_tmp})
df = df.sort('e_gain', ascending=False)

ind = np.arange(df.shape[0])
pl.figure(figsize=(9, 4))
pl.bar(ind,df.values[:,0],df.values[:,1])
pl.xticks(ind, df.index, rotation=90)
# Not great plot
Example #15
0
all_games = pd.read_csv('Data/Box_Scores/' + year + '/Game_List_' + year + '.csv', dtype={'Away_PTS': np.object, 'Home_PTS': np.object})
all_games['Date'] = pd.to_datetime(all_games['Date'])
DateAsOf = max(all_games['Date'].loc[(all_games['Home_PTS']!=' ')])
print ("Date As Of: " + str(DateAsOf.date()))
all_games['Home_PTS'] = pd.to_numeric(all_games['Home_PTS'], errors='coerce')
all_games['Away_PTS'] = pd.to_numeric(all_games['Away_PTS'], errors='coerce')
all_games = all_games.loc[all_games['Away_PTS'].isnull()==False]
all_games = all_games.rename(columns={'Home': 'HOME', 'Home_PTS': 'HOME_PTS', 'Away': 'AWAY', 'Away_PTS': 'AWAY_PTS'})

teams = list(all_games['HOME'].value_counts().keys())
teams.sort()

dates = Series(all_games['Date'].value_counts().keys())
dates = list(dates[dates>start_date])
dates.sort()

def calc_sos(team_df, MOV_df, BaseRate):
    teams = list(MOV_df['Team'])
    SOS = [0 for team in teams]

    init_rate = MOV_df[BaseRate]
    MOV_df = MOV_df.rename(columns={'Team':'VS_Team', BaseRate: 'VS_' + BaseRate})
    for (i, team) in enumerate(teams):
        team_mov = pd.merge(team_df[i], MOV_df[['VS_Team', 'VS_' + BaseRate]], how='left', on='VS_Team')
        SOS[i] = np.mean(team_mov['VS_' + BaseRate])

    MOV_df = MOV_df.rename(columns={'VS_Team':'Team', 'VS_' + BaseRate:BaseRate})
    MOV_df['SOS'] = SOS - np.mean(SOS)
    MOV_df['SRS'] = MOV_df['MOV'] + MOV_df['SOS']
    delta = np.mean(np.absolute(MOV_df['SRS']-init_rate))
Example #16
0
print(s['a'], end='')
print(' which is same as ', end='')
print(s.a)

# A lot of Numpy functions also accept Series as arguments without problems

# Some useful functions
# Series.median(axis=None, skipna=None, level=None, numeric_only=None, **kwargs),
# axis is either 0 or 1. axis = 0 means 'column-wise' and axis = 1 means 'row-wise'
# as of now, we know that a Series is a 1D array, so only axis = 0 makes sense
# if you pass axis = 1, you get an error because it doesn't make sense for a 1D array
# Check for yourself.
# skipna = skip NA/null values. If everything in the series is NA, return NA
# level = don't know yet
# numeric_only = currently not implemented
s.sort()
print(s)
print(
    s.median(axis=0)
)  # median: sort first, then take the middle element; for an even number of entries, the average of the two middle items.
#print(s.median(axis=1))

# Comparing whole list with a scalar
l = (
    s > s.median()
)  # this compares every item in the list with the scalar and returns a list of bool
print(l)
print(
    s[l]
)  # This prints only the items where the corresponding entry in the boolean list is True. So elegant. Remember it. :-)
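A concrete run of the median-and-mask idea explained above, as a small self-contained sketch:

import pandas as pd

s = pd.Series([3, 1, 4, 1, 5], index=list('abcde')).sort_values()
mask = s > s.median()        # element-wise comparison -> boolean Series
print(s[mask])               # keeps only the entries strictly above the median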
Example #17
0
def get_hist_data(alist,limit=10):
    """make a freq series from list using pandas, limit to most freq 10"""
    genus_freq = Series(alist).value_counts()
    genus_freq.sort(ascending=False) #biggest to smallest
    return genus_freq[:limit]
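value_counts already returns counts sorted from most to least frequent, so on current pandas the helper above reduces to a sketch like:

from pandas import Series

def get_hist_data(alist, limit=10):
    """Frequency Series of the `limit` most common items in alist."""
    return Series(alist).value_counts()[:limit]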
Example #18
0
import pandas as pd
from pandas import Series
import re

import matplotlib
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

s1 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-25.csv')
s2 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-26.csv')
s3 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-27.csv')
s4 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-28.csv')

all_tweets = pd.concat([s1,s2,s3,s4])

twitter_handle_re = re.compile(r'@([A-Za-z0-9_]+)')

mention_counts = Series()
for item in all_tweets:
    mentions = twitter_handle_re.findall(item)
    for mention in mentions:
        if mention in mention_counts.keys():
            mention_counts[mention] += 1
        else:
            mention_counts[mention] = 1

mention_counts.sort(ascending = False)
#print mention_counts

mention_counts.plot()
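The per-mention counting loop above can be collapsed with the vectorised string methods on recent pandas (0.25+ for explode); a sketch over a stand-in all_tweets Series, with the handle regex unchanged:

import pandas as pd

# stand-in for the concatenated all_tweets Series built above
all_tweets = pd.Series(['@alice check this @bob', 'thanks @alice', 'no mentions here'])
mention_counts = (all_tweets.str.findall(r'@([A-Za-z0-9_]+)')   # list of handles per tweet
                            .explode()                          # one row per handle (pandas >= 0.25)
                            .dropna()                           # tweets with no mentions become NaN
                            .value_counts())                    # counts, sorted descending
print(mention_counts)        # alice 2, bob 1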
Example #19
0
def summarycalc(projid,date=None,state=0):
    proj=web.ctx.orm.query(Project).filter_by(projid=projid).first()
    if proj is None:
        return None
    symbolsin=proj.symbol
    causesin = proj.cause
    summary_dict={}
    for symbol in symbolsin:
        symbolid=symbol.symbolid
        resultid=str(projid)+'_'+str(symbolid)
        allresult=[]
        totalresult=list(web.ctx.orm.query(Result).filter_by(resultid=resultid).all())
        if totalresult==[]:
            return None
        if state ==0:
            date=totalresult[0].origintime.strftime('%Y-%m-%d')
            #datestr=totalresult[0].origintime.strftime('%Y-%m-%d')
            #date=time.strptime(datestr,'%Y-%m-%d')
            #date=datetime.datetime(*totalresult[0].origintime[:6])
        #allresult=[item for item in totalresult if func.date_format(item.origintime,'%Y-%m-%d')==date]
        allresult=[item for item in totalresult if item.origintime.strftime('%Y-%m-%d')==date]
        if allresult == []:
            return None
        symbolresultdict={}
        cppredict_dict={}
        ncppredict_dict={}
        for cause in causesin:
            causeid=cause.causeid
            if cause.causename in ['cause4','cause5','cause8','cause9']:
                lcausevalue=json.loads(cause.causevalue)
                xdayslist=lcausevalue[cause.causename]
                for xday in xdayslist:
                    cpname='cp for '+xday+' '+gcausedict[cause.causename]
                    ncpname='ncp for '+xday+' '+gcausedict[cause.causename]
                    predicter_cp=web.ctx.orm.query(Predict).filter_by(causeid=causeid,resultname=cpname).first()
                    predicter_ncp=web.ctx.orm.query(Predict).filter_by(causeid=causeid,resultname=ncpname).first()
                    cppredict_dict[cpname]=predicter_cp.coefficient
                    ncppredict_dict[ncpname]=predicter_ncp.coefficient

            else:
                cpname='cp for '+gcausedict[cause.causename]
                ncpname='ncp for '+gcausedict[cause.causename]
                predicter_cp=web.ctx.orm.query(Predict).filter_by(causeid=causeid,resultname=cpname).first()
                predicter_ncp=web.ctx.orm.query(Predict).filter_by(causeid=causeid,resultname=ncpname).first()
                cppredict_dict[cpname]=predicter_cp.coefficient
                ncppredict_dict[ncpname]=predicter_ncp.coefficient
        cppredict_ser=Series(cppredict_dict,index=cppredict_dict.keys())
        ncppredict_ser=Series(ncppredict_dict,index=ncppredict_dict.keys())
        cppredict_ser.sort()
        ncppredict_ser.sort()
        if len(cppredict_ser)<3:
            realcp_ser=cppredict_ser
        else:
            realcp_ser=cppredict_ser[-3:]
        if len(ncppredict_ser)<3:
            realncp_ser=ncppredict_ser
        else:
            realncp_ser=ncppredict_ser[-3:]
        symbol_summary_dict={}
        for idx in realcp_ser.index:
            symbol_summary_dict[idx]=[process(item.resultvalue) for item in allresult if item.resultname==idx][0]
        for idx in realncp_ser.index:
            symbol_summary_dict[idx]=[process(item.resultvalue) for item in allresult if item.resultname==idx][0]
        summary_dict[symbol]=symbol_summary_dict
    summary_df=DataFrame(summary_dict)
    return summary_df.T
Example #20
0
nonalphabet = re.compile("[^a-z]")

with open('words.txt', 'r') as f:
    lines = f.readlines()

trigrams = {}
for line in lines:
    trigram = line.strip().lower()[0:3]
    if len(trigram) >= 3 and not nonalphabet.search(trigram):
        if trigram == "aaa":
            print "line: {0} trigram: {1}".format(line, trigram)
        trigrams.setdefault(trigram, 0)
        trigrams[trigram] += 1

trigram_series = Series(trigrams.values(), index=trigrams.keys())
trigram_series.sort(inplace=True, ascending=True)
print trigram_series
print "quartiles:\n{0}".format(
    trigram_series.quantile([.25, .50, .75, .99]).to_string())

print "median is: {0}".format(trigram_series.median())
unique_trigrams = []
for trigram, count in trigrams.iteritems():
    if count > trigram_series.quantile(.50):
        unique_trigrams.append(trigram)

print "saving trigrams"
with open("trigrams.json", "w") as f:
    json.dump(unique_trigrams, f)
print "saved {0} trigrams".format(len(unique_trigrams))
Example #21
0
def interrogator(
    corpus,
    search,
    query="any",
    show="w",
    exclude=False,
    excludemode="any",
    searchmode="all",
    dep_type="collapsed-ccprocessed-dependencies",
    case_sensitive=False,
    quicksave=False,
    just_speakers=False,
    preserve_case=False,
    lemmatag=False,
    files_as_subcorpora=False,
    conc=False,
    only_unique=False,
    random=False,
    only_format_match=False,
    multiprocess=False,
    spelling=False,
    regex_nonword_filter=r"[A-Za-z0-9:_]",
    gramsize=2,
    split_contractions=False,
    **kwargs
):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for docstring"""
    # store kwargs
    locs = locals()

    from corpkit.interrogation import Interrogation
    from corpkit.process import tregex_engine
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from corpkit.other import as_regex
    from corpkit.process import get_deps
    from time import localtime, strftime

    thetime = strftime("%H:%M:%S", localtime())
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator
    from corpkit.dictionaries.word_transforms import wordlist, taglemma

    # find out if using gui
    root = kwargs.get("root")
    note = kwargs.get("note")

    # convert path to corpus object
    if type(corpus) == str:
        from corpkit.corpus import Corpus

        corpus = Corpus(corpus)

    # figure out how the user has entered the query and normalise
    from corpkit.process import searchfixer

    search, search_iterable = searchfixer(search, query)

    # for better printing of query, esp during multiprocess
    # can remove if multiprocess printing improved
    if len(search.keys()) == 1:
        query = search.values()[0]

    if "l" in show and search.get("t"):
        from nltk.stem.wordnet import WordNetLemmatizer

        lmtzr = WordNetLemmatizer()

    if type(show) == str:
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed
        do some retyping if need be as well"""
        im = False
        from collections import OrderedDict

        if hasattr(corpus, "__iter__"):
            im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if type(query) == list:
            if query != search.values()[0] or len(search.keys()) > 1:
                query = {c.title(): c for c in query}
        if type(query) == dict or type(query) == OrderedDict:
            im = True
        if just_speakers:
            if just_speakers == "each":
                im = True
                just_speakers = ["each"]
            if just_speakers == ["each"]:
                im = True
            if type(just_speakers) == str:
                im = False
                just_speakers = [just_speakers]
            if type(just_speakers) == list:
                if len(just_speakers) > 1:
                    im = True
        if type(search) == dict:
            if all(type(i) == dict for i in search.values()):
                im = True
        return im, corpus, search, query, just_speakers

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        import os
        from corpkit.process import tregex_engine

        # first, put the relevant trees into temp file
        if kwargs.get("outname"):
            to_open = "tmp-%s.txt" % kwargs["outname"]
        else:
            to_open = "tmp.txt"
        to_write = "\n".join([sent._parse_string.strip() for sent in sents if sent.parse_string is not None])
        to_write.encode("utf-8", errors="ignore")
        with open(to_open, "w") as fo:
            fo.write(to_write)
        q = search.values()[0]
        res = tregex_engine(
            query=q, options=["-o", "-%s" % translated_option], corpus=to_open, root=root, preserve_case=True
        )
        if root:
            root.update()
        os.remove(to_open)
        if countmode:
            return len(res)
        else:
            return res

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter

        statsmode_results = Counter()
        # first, put the relevant trees into temp file
        if kwargs.get("outname"):
            to_open = "tmp-%s.txt" % kwargs["outname"]
        else:
            to_open = "tmp.txt"
        with open(to_open, "w") as fo:
            for sent in sents:
                statsmode_results["Sentences"] += 1
                sts = sent.parse_string.rstrip()
                encd = sts.encode("utf-8", errors="ignore") + "\n"
                fo.write(encd)
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith("pass")])
                statsmode_results["Passives"] += numpass
                statsmode_results["Tokens"] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results["Words"] += len(words)
                statsmode_results["Characters"] += len("".join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from corpkit.other import as_regex

        tregex_qs = {
            "Imperative": r"ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/",
            "Open interrogative": r"ROOT < SBARQ <<- (/\?/ !< __)",
            "Closed interrogative": r"ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))",
            "Unmodalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))",
            "Modalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))",
            "Open class words": r"/^(NN|JJ|VB|RB)/ < __",
            "Closed class words": r"__ !< __ !> /^(NN|JJ|VB|RB)/",
            "Clauses": r"/^S/ < __",
            "Interrogative": r"ROOT << (/\?/ !< __)",
            "Mental processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.mental, boundaries="w"),
            "Verbal processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.verbal, boundaries="w"),
            "Relational processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)"
            % as_regex(processes.relational, boundaries="w"),
        }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query=q, options=["-o", "-C"], corpus=to_open, root=root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + "/" + str(total_files)
                if kwargs.get("outname"):
                    tot_string = "%s: %s" % (kwargs["outname"], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get("note", False):
                kwargs["note"].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr=False):
        if speakr is False:
            speakr = ""
        conc_lines = []
        # remove duplicates from results
        unique_wholes = []
        unique_middle_column_result = []
        duplicates = []
        for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)):
            if "-join-".join([f, whole, mid]) not in duplicates:
                duplicates.append("-join-".join([f, whole, mid]))
                unique_wholes.append([f, whole])
                unique_middle_column_result.append(mid)

        # split into start, middle and end, dealing with multiple occurrences
        for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)):
            reg = re.compile(r"([^a-zA-Z0-9-]|^)(" + re.escape(mid) + r")([^a-zA-Z0-9-]|$)", re.IGNORECASE | re.UNICODE)
            offsets = [(m.start(), m.end()) for m in re.finditer(reg, whole)]
            for offstart, offend in offsets:
                start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines

    def uniquify(conc_lines):
        from collections import OrderedDict

        unique_lines = []
        checking = []
        for index, (f, speakr, start, middle, end) in enumerate(conc_lines):
            joined = " ".join([speakr, start, "MIDDLEHERE:", middle, ":MIDDLEHERE", end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith("u"):
                if word.lower() in taglemma.keys():
                    word = taglemma[word.lower()]
                else:
                    if word == "x":
                        word = "Other"
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output

    def gettag(query, lemmatag=False):
        """
        Find tag for WordNet lemmatisation
        """
        import re

        tagdict = {"N": "n", "A": "a", "V": "v", "A": "r", "None": False, "": False, "Off": False}

        if lemmatag is False:
            tag = "n"  # same default as wordnet
            # attempt to find tag from tregex query
            tagfinder = re.compile(r"^[^A-Za-z]*([A-Za-z]*)")
            tagchecker = re.compile(r"^[A-Z]{1,4}$")
            qr = query.replace(r"\w", "").replace(r"\s", "").replace(r"\b", "")
            treebank_tag = re.findall(tagfinder, qr)
            if re.match(tagchecker, treebank_tag[0]):
                tag = tagdict.get(treebank_tag[0], "n")
        elif lemmatag:
            tag = lemmatag
        return tag

    def format_tregex(results):
        """format tregex by show list"""
        if countmode:
            return results
        import re

        done = []
        if "l" in show or "pl" in show:
            lemmata = lemmatiser(results, gettag(search.get("t"), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            bits = []
            if exclude and exclude.get("w"):
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("w"), word):
                        continue
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("l"), lemma):
                        continue
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("p"), word):
                        continue
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("pl"), lemma):
                        continue
            if exclude and excludemode == "all":
                num_to_cause_exclude = len(exclude.keys())
                current_num = 0
                if exclude.get("w"):
                    if re.search(exclude.get("w"), word):
                        current_num += 1
                if exclude.get("l"):
                    if re.search(exclude.get("l"), lemma):
                        current_num += 1
                if exclude.get("p"):
                    if re.search(exclude.get("p"), word):
                        current_num += 1
                if exclude.get("pl"):
                    if re.search(exclude.get("pl"), lemma):
                        current_num += 1
                if current_num == num_to_cause_exclude:
                    continue

            for i in show:
                if i == "t":
                    bits.append(word)
                if i == "l":
                    bits.append(lemma)
                elif i == "w":
                    bits.append(word)
                elif i == "p":
                    bits.append(word)
                elif i == "pl":
                    bits.append(lemma)
            joined = "/".join(bits)
            done.append(joined)
        return done

    def tok_by_list(pattern, list_of_toks, concordancing=False, **kwargs):
        """search for regex in plaintext corpora"""
        import re

        if type(pattern) == str:
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        if not concordancing:
            if case_sensitive:
                matches = [m for m in list_of_toks if m in pattern]
            else:
                matches = [m for m in list_of_toks if m.lower() in pattern]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if token in pattern:
                    match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches

    def unsplitter(lst):
        """unsplit contractions and apostophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = "".join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit

    def tok_ngrams(pattern, list_of_toks, concordancing=False, split_contractions=True):
        from collections import Counter
        import re

        ngrams = Counter()
        result = []
        # if it's not a compiled regex
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        if pattern.lower() == "any":
            pattern = r".*"

        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)

            # list_of_toks = [x for x in list_of_toks if "'" not in x]
        for index, w in enumerate(list_of_toks):
            try:
                the_gram = [list_of_toks[index + x] for x in range(gramsize)]
                if not any(re.search(pattern, x) for x in the_gram):
                    continue
                ngrams[" ".join(the_gram)] += 1
            except IndexError:
                pass

        # turn counter into list of results
        for k, v in ngrams.items():
            if v > 1:
                for i in range(v):
                    result.append(k)
        if countmode:
            return len(result)
        else:
            return result

    def compiler(pattern):
        """compile regex or fail gracefully"""
        import re

        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime

            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print "%s: Query %s" % (thetime, error_message)
            if root:
                return "Bad query"
            else:
                raise ValueError("%s: Query %s" % (thetime, error_message))

    def tok_by_reg(pattern, list_of_toks, concordancing=False, **kwargs):
        """search for regex in plaintext corpora"""
        import re

        comped = compiler(pattern)
        if comped == "Bad query":
            return "Bad query"
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if re.search(comped, token):
                    match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(re.search(comped, token).group(0))
                    match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for regex in plaintext corpora

        it searches over lines, so the user needs to be careful.
        """
        import re

        if concordancing:
            pattern = r"(.{,140})\b(" + pattern + r")\b(.{,140})"
        compiled_pattern = compiler(pattern)
        if compiled_pattern == "Bad query":
            return "Bad query"
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        if not concordancing:
            for index, i in enumerate(matches):
                if type(i) == tuple:
                    matches[index] = i[0]
        if countmode:
            return len(matches)
        else:
            return matches

    def correct_spelling(a_string):
        if not spelling:
            return a_string
        from dictionaries.word_transforms import usa_convert

        if spelling.lower() == "uk":
            usa_convert = {v: k for k, v in usa_convert.items()}
        spell_out = []
        bits = a_string.split("/")
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = "/".join(bits)
        return r

    def plaintext_simple_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for tokens in plaintext corpora"""
        import re

        result = []
        if type(pattern) == str:
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r"(.{0,140})\b(" + re.escape(p) + r")\b(.{0,140})"
            pat = compiler(pat)
            if pat == "Bad query":
                return "Bad query"
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                matches = [list(m) for m in matches]
                for i in matches:
                    result.append(i)
            else:
                for m in range(len(matches)):
                    result.append(p)
        return result

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers)

    locs["search"] = search
    locs["query"] = query
    locs["just_speakers"] = just_speakers
    locs["corpus"] = corpus
    locs["multiprocess"] = multiprocess

    if im:
        from corpkit.multiprocess import pmultiquery

        return pmultiquery(**locs)

    datatype = corpus.datatype
    singlefile = corpus.singlefile

    # store all results in here
    results = {}
    # check if just counting
    countmode = "c" in show
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get("denominator", 1)
    startnum = kwargs.get("startnum", 0)

    ############################################
    # Determine the search function to be used #
    ############################################

    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    if not just_speakers and "t" in search.keys():
        simple_tregex_mode = True
    else:
        if corpus.datatype == "plaintext":
            if search.get("n"):
                raise NotImplementedError("Use a tokenised corpus for n-gramming.")
                # searcher = plaintext_ngram
                optiontext = "n-grams via plaintext"
            if search.get("w"):
                if kwargs.get("regex", True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = "Searching plaintext"

        elif corpus.datatype == "tokens":
            if search.get("n"):
                searcher = tok_ngrams
                optiontext = "n-grams via tokens"
            elif search.get("w"):
                if kwargs.get("regex", True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if type(search.get("w")) == list:
                    searcher = tok_by_list
                optiontext = "Searching tokens"
        only_parse = ["r", "d", "g", "dl", "gl", "df", "gf", "dp", "gp", "f"]
        if corpus.datatype != "parse" and any(i in only_parse for i in search.keys()):
            raise ValueError(
                'Need parsed corpus to search with "%s" option(s).'
                % ", ".join([i for i in search.keys() if i in only_parse])
            )

        elif corpus.datatype == "parse":
            if search.get("t"):
                searcher = slow_tregex
            elif search.get("s"):
                searcher = get_stats
                statsmode = True
                optiontext = "General statistics"
                global numdone
                numdone = 0
            else:
                from corpkit.depsearch import dep_searcher

                searcher = dep_searcher
                optiontext = "Dependency querying"

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get("t"):
        query = search.get("t")

        # check the query
        q = tregex_engine(corpus=False, query=search.get("t"), options=["-t"], check_query=True, root=root)
        if query is False:
            if root:
                return "Bad query"
            else:
                return

        optiontext = "Searching parse trees"
        if "p" in show or "pl" in show:
            translated_option = "u"
            if type(search["t"]) == list:
                search["t"] = r"__ < (/%s/ !< __)" % as_regex(
                    search["t"], boundaries="line", case_sensitive=case_sensitive
                )
            if search["t"] == "any":
                search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)"
        elif "t" in show:
            translated_option = "o"
            if type(search["t"]) == list:
                search["t"] = r"__ < (/%s/ !< __)" % as_regex(
                    search["t"], boundaries="line", case_sensitive=case_sensitive
                )
            if search["t"] == "any":
                search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)"
        elif "w" in show:
            translated_option = "t"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"
        elif "c" in show:
            count_results = {}
            only_count = True
            translated_option = "C"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"
        elif "l" in show:
            translated_option = "t"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"

        query = search["t"]

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not corpus.subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for k, v in sorted(corpus.structure.items()):
            to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if conc:
        message = "Concordancing"
    else:
        message = "Interrogating"
    if kwargs.get("printstatus", True):
        thetime = strftime("%H:%M:%S", localtime())

        sformat = "\n                 ".join(["%s: %s" % (k.rjust(3), v) for k, v in search.items()])
        if search == {"s": r".*"}:
            sformat = "features"
        welcome = "\n%s: %s %s ...\n          %s\n          Query: %s\n" % (
            thetime,
            message,
            corpus.name,
            optiontext,
            sformat,
        )
        print welcome

    ############################################
    #           Make progress bar              #
    ############################################

    if simple_tregex_mode:
        total_files = len(to_iterate_over.keys())
    else:
        if search.get("s"):
            total_files = sum([len(x) for x in to_iterate_over.values()]) * 12
        else:
            total_files = sum([len(x) for x in to_iterate_over.values()])

    par_args = {"printstatus": kwargs.get("printstatus", True), "root": root, "note": note, "length": total_files}

    term = None
    if kwargs.get("paralleling", None) is not None:
        from blessings import Terminal

        term = Terminal()
        par_args["terminal"] = term
        par_args["linenum"] = kwargs.get("paralleling")

    outn = kwargs.get("outname", "")
    if outn:
        outn = outn + ": "
    tstr = "%s%d/%d" % (outn, current_iter, total_files)
    p = animator(None, None, init=True, tot_string=tstr, **par_args)
    tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):

        if countmode or conc:
            results[subcorpus_name] = []
        else:
            results[subcorpus_name] = Counter()

        # tregex over subcorpora, not files
        if simple_tregex_mode:

            op = ["-o", "-" + translated_option]
            result = tregex_engine(
                query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case
            )

            if countmode:
                results[subcorpus_name].append(result)
                continue

            result = Counter(format_tregex(result))

            if conc:
                op.append("-w")
                whole_result = tregex_engine(
                    query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case
                )

                if not only_format_match:
                    whole_result = format_tregex(whole_result)

                result = make_conc_lines_from_whole_mid(whole_result, result, speakr=False)

                if spelling:
                    for index, line in enumerate(result):
                        result[index] = [correct_spelling(b) for b in line]

            results[subcorpus_name] += result

            current_iter += 1
            if kwargs.get("paralleling", None) is not None:
                tstr = "%s%d/%d" % (outn, current_iter + 2, total_files)
            else:
                tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:

                if corpus.datatype == "parse":
                    with open(f.path, "r") as data:
                        data = data.read()
                        from corenlp_xml.document import Document

                        try:
                            corenlp_xml = Document(data)
                        except:
                            print "Could not read file: %s" % f.path
                            continue
                        if just_speakers:
                            sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                            if not sents:
                                continue
                        else:
                            sents = corenlp_xml.sentences

                        res = searcher(
                            sents,
                            search=search,
                            show=show,
                            dep_type=dep_type,
                            exclude=exclude,
                            excludemode=excludemode,
                            searchmode=searchmode,
                            lemmatise=False,
                            case_sensitive=case_sensitive,
                            concordancing=conc,
                            only_format_match=only_format_match,
                        )

                        if res == "Bad query":
                            return "Bad query"

                        if searcher == slow_tregex and not countmode:
                            res = format_tregex(res)

                elif corpus.datatype == "tokens":
                    import pickle

                    with open(f.path, "rb") as fo:
                        data = pickle.load(fo)
                    res = searcher(search.values()[0], data, split_contractions=split_contractions, concordancing=conc)
                    if conc:
                        for index, line in enumerate(res):
                            line.insert(0, "")

                elif corpus.datatype == "plaintext":
                    with open(f.path, "rb") as data:
                        data = data.read()
                        data = unicode(data, errors="ignore")
                        res = searcher(search.values()[0], data, concordancing=conc)
                        if conc:
                            for index, line in enumerate(res):
                                line.insert(0, "")

                if countmode:
                    results[subcorpus_name] += res
                    continue

                # add filename and do lowercasing for conc
                if conc:
                    for index, line in enumerate(res):
                        line.insert(0, f.name)
                        if not preserve_case:
                            line = [b.lower() for b in line]
                        if spelling:
                            line = [correct_spelling(b) for b in line]
                        results[subcorpus_name] += [line]

                # do lowercasing and spelling
                else:
                    if not preserve_case:
                        res = [r.lower() for r in res]
                    if spelling:
                        res = [correct_spelling(r) for r in res]
                    results[subcorpus_name] += Counter(res)

                if not statsmode:
                    current_iter += 1
                    if kwargs.get("paralleling", None) is not None:
                        tstr = "%s%d/%d" % (outn, current_iter + 2, total_files)
                    else:
                        tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)

    # delete temp file if there
    import os

    if os.path.isfile("tmp.txt"):
        os.remove("tmp.txt")

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if conc:
        all_conc_lines = []
        for sc_name, resu in sorted(results.items()):

            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            # make into series
            pindex = "c f s l m r".encode("utf-8").split()
            for fname, spkr, start, word, end in unique_results:
                spkr = unicode(spkr, errors="ignore")
                fname = os.path.basename(fname)

                # the use of ascii here makes sure the string formats ok, but will also screw over
                # anyone doing non-english work. so, change to utf-8, then fix errors as they come
                # in the corpkit-gui "add_conc_lines_to_window" function
                all_conc_lines.append(
                    Series(
                        [
                            sc_name.encode("ascii", errors="ignore"),
                            fname.encode("ascii", errors="ignore"),
                            spkr.encode("ascii", errors="ignore"),
                            start.encode("ascii", errors="ignore"),
                            word.encode("ascii", errors="ignore"),
                            end.encode("ascii", errors="ignore"),
                        ],
                        index=pindex,
                    )
                )

        # randomise results...
        if random:
            from random import shuffle

            shuffle(all_conc_lines)

        df = pd.concat(all_conc_lines, axis=1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            df.columns = ["c", "f", "s", "l", "m", "r"]
        else:
            df.columns = ["c", "f", "s", "l", "m", "r", "link"]

        if all(x == "" for x in list(df["s"].values)):
            df.drop("s", axis=1, inplace=True)

        if kwargs.get("note"):
            kwargs["note"].progvar.set(100)

        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Concordancing finished! %d matches.\n" % (thetime, len(df.index))
            print finalstring

        from corpkit.interrogation import Concordance

        output = Concordance(df)
        output.query = locs
        if quicksave:
            output.save()
        return output

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    else:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in results.values() for item in sublist])
            for word in unique_results:
                the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items())]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index=sorted(results.keys()))

            numentries = len(df.columns)
            tot = df.sum(axis=1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get("df1_always_df"):
                        df = Series(df.ix[0])
                        df.sort(ascending=False)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort by total
        if type(df) == pd.core.frame.DataFrame:
            if not df.empty:
                df.ix["Total-tmp"] = df.sum()
                the_tot = df.ix["Total-tmp"]
                df = df[the_tot.argsort()[::-1]]
                df = df.drop("Total-tmp", axis=0)

        # format final string
        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Interrogation finished!" % thetime
            if countmode:
                finalstring += " %d matches." % tot
            else:
                finalstring += " %d unique results, %d total occurrences." % (numentries, total_total)
            print finalstring

        interro = Interrogation(results=df, totals=tot, query=locs)

        if quicksave:
            interro.save()

        return interro
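
# Sketch of the object returned above (names follow the
# Interrogation(results=df, totals=tot, query=locs) call; illustrative only):
#   interro.results  DataFrame -- one row per subcorpus, one column per match form
#   interro.totals   Series/number -- row totals of .results (a single count in countmode)
#   interro.query    dict -- the local settings captured when the query was run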
Example #22
0
def interrogator(corpus, 
            search, 
            query = 'any', 
            show = 'w',
            exclude = False,
            excludemode = 'any',
            searchmode = 'all',
            dep_type = 'collapsed-ccprocessed-dependencies',
            case_sensitive = False,
            quicksave = False,
            just_speakers = False,
            preserve_case = False,
            lemmatag = False,
            files_as_subcorpora = False,
            conc = False,
            only_unique = False,
            random = False,
            only_format_match = False,
            multiprocess = False,
            spelling = False,
            regex_nonword_filter = r'[A-Za-z0-9:_]',
            gramsize = 2,
            split_contractions = False,
            **kwargs):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for docstring"""
    # store kwargs
    locs = locals()

    from interrogation import Interrogation
    from process import tregex_engine
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from other import as_regex
    from process import get_deps
    from time import localtime, strftime
    thetime = strftime("%H:%M:%S", localtime())
    from textprogressbar import TextProgressBar
    from process import animator
    from dictionaries.word_transforms import wordlist, taglemma
    import corenlp_xml
    import codecs

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')

    # convert path to corpus object
    if type(corpus) == str:
        from corpus import Corpus
        corpus = Corpus(corpus)

    # figure out how the user has entered the query and normalise
    from process import searchfixer
    search, search_iterable = searchfixer(search, query)
    
    # for better printing of query, esp during multiprocess
    # can remove if multiprocess printing improved
    if len(list(search.keys())) == 1:
        query = list(search.values())[0]

    if 'l' in show and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr=WordNetLemmatizer()

    if type(show) == str:
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed
        do some retyping if need be as well"""
        im = False
        from collections import OrderedDict
        if hasattr(corpus, '__iter__'):
            im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if type(query) == list:
            if query != list(search.values())[0] or len(list(search.keys())) > 1:
                query = {c.title(): c for c in query}
        if type(query) == dict or type(query) == OrderedDict:
            im = True
        if just_speakers:
            if just_speakers == 'each':
                im = True
                just_speakers = ['each']
            if just_speakers == ['each']:
                im = True
            if type(just_speakers) == str:
                im = False
                just_speakers = [just_speakers]
            if type(just_speakers) == list:
                if len(just_speakers) > 1:
                    im = True
        if type(search) == dict:
            if all(type(i) == dict for i in list(search.values())):
                im = True
        return im, corpus, search, query, just_speakers

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        import os
        from process import tregex_engine
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        to_write = '\n'.join([sent._parse_string.strip() for sent in sents \
                              if sent.parse_string is not None])
        to_write = to_write.encode('utf-8', errors = 'ignore')
        with open(to_open, "w") as fo:
            fo.write(to_write)
        q = list(search.values())[0]
        res = tregex_engine(query = q, 
                            options = ['-o', '-%s' % translated_option], 
                            corpus = to_open,
                            root = root,
                            preserve_case = True)
        if root:
            root.update()
        os.remove(to_open)
        if countmode:
            return(len(res))
        else:
            return res

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter
        statsmode_results = Counter()  
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        with open(to_open, "w") as fo:
            for sent in sents:
                statsmode_results['Sentences'] += 1
                sts = sent.parse_string.rstrip()
                encd = sts.encode('utf-8', errors = 'ignore') + '\n'
                fo.write(encd)
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith('pass')])
                statsmode_results['Passives'] += numpass
                statsmode_results['Tokens'] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results['Words'] += len(words)
                statsmode_results['Characters'] += len(''.join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from other import as_regex
        tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/',
                     'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 
                     'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))',
                     'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))',
                     'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))',
                     'Open class words': r'/^(NN|JJ|VB|RB)/ < __',
                     'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/',
                     'Clauses': r'/^S/ < __',
                     'Interrogative': r'ROOT << (/\?/ !< __)',
                     'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'),
                     'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'),
                     'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w')
                     }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query = q, 
                  options = ['-o', '-C'], 
                  corpus = to_open,  
                  root = root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + '/' + str(total_files)
                if kwargs.get('outname'):
                    tot_string = '%s: %s' % (kwargs['outname'], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get('note', False):
                kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, 
                                       speakr = False):
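        # returns concordance lines of the form [filename, speaker, left, match, right],
        # one line per occurrence of the match inside its source sentence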
        if speakr is False:
            speakr = ''
        conc_lines = []
        # remove duplicates from results
        unique_wholes = []
        unique_middle_column_result = []
        duplicates = []
        for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)):
            if '-join-'.join([f, whole, mid]) not in duplicates:
                duplicates.append('-join-'.join([f, whole, mid]))
                unique_wholes.append([f, whole])
                unique_middle_column_result.append(mid)

        # split into start, middle and end, dealing with multiple occurrences
        for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)):
            reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)', re.IGNORECASE | re.UNICODE)
            offsets = [(m.start(), m.end()) for m in re.finditer(reg,whole)]
            for offstart, offend in offsets:              
                start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines

    def uniquify(conc_lines):
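        # keep only the first of any concordance lines whose speaker/left/match/right
        # are identical (the filename column is ignored when comparing)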
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (f, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith('u'):
                if word.lower() in list(taglemma.keys()):
                    word = taglemma[word.lower()]
                else:
                    if word == 'x':
                        word = 'Other'
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output

    def gettag(query, lemmatag = False):
        """
        Find tag for WordNet lemmatisation
        """
        import re

        tagdict = {'N': 'n',
                   'A': 'a',
                   'V': 'v',
                   'R': 'r',
                   'None': False,
                   '': False,
                   'Off': False}

        if lemmatag is False:
            tag = 'n' # same default as wordnet
            # attempt to find tag from tregex query
            tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)')
            tagchecker = re.compile(r'^[A-Z]{1,4}$')
            qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '')
            treebank_tag = re.findall(tagfinder, qr)
            if re.match(tagchecker, treebank_tag[0]):
                tag = tagdict.get(treebank_tag[0], 'n')
        elif lemmatag:
            tag = lemmatag
        return tag

    def format_tregex(results):
        """format tregex by show list"""
        if countmode:
            return results
        import re
        done = []
        if 'l' in show or 'pl' in show:
            lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            bits = []
            # guard each exclude key separately so missing keys are simply skipped
            if exclude and exclude.get('w'):
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('w'), word):
                        continue
            if exclude and exclude.get('l'):
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('l'), lemma):
                        continue
            if exclude and exclude.get('p'):
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('p'), word):
                        continue
            if exclude and exclude.get('pl'):
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('pl'), lemma):
                        continue
            if exclude and excludemode == 'all':
                num_to_cause_exclude = len(list(exclude.keys()))
                current_num = 0
                if exclude.get('w'):
                    if re.search(exclude.get('w'), word):
                        current_num += 1
                if exclude.get('l'):
                    if re.search(exclude.get('l'), lemma):
                        current_num += 1
                if exclude.get('p'):
                    if re.search(exclude.get('p'), word):
                        current_num += 1
                if exclude.get('pl'):
                    if re.search(exclude.get('pl'), lemma):
                        current_num += 1   
                if current_num == num_to_cause_exclude:
                    continue                 

            for i in show:
                if i == 't':
                    bits.append(word)
                if i == 'l':
                    bits.append(lemma)
                elif i == 'w':
                    bits.append(word)
                elif i == 'p':
                    bits.append(word)
                elif i == 'pl':
                    bits.append(lemma)
            joined = '/'.join(bits)
            done.append(joined)
        return done

    def tok_by_list(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        if type(pattern) == str:
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        if not concordancing:
            if case_sensitive:
                matches = [m for m in list_of_toks if m in pattern]
            else:
                matches = [m for m in list_of_toks if m.lower() in pattern]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if token in pattern:
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches

    def unsplitter(lst):
        """unsplit contractions and apostophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = ''.join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit

    def tok_ngrams(pattern, list_of_toks, concordancing = False, split_contractions = True):
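        # build gramsize-length n-grams over the token list; each n-gram is returned
        # once per occurrence, but only if it occurs more than once overall,
        # e.g. pattern='any', gramsize=2: ['a', 'b', 'a', 'b'] -> ['a b', 'a b']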
        from collections import Counter
        import re
        ngrams = Counter()
        result = []
        # if it's not a compiled regex
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        if pattern.lower() == 'any':
            pattern = r'.*'

        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)
            
            #list_of_toks = [x for x in list_of_toks if "'" not in x]
        for index, w in enumerate(list_of_toks):
            try:
                the_gram = [list_of_toks[index+x] for x in range(gramsize)]
                if not any(re.search(pattern, x) for x in the_gram):
                    continue
                ngrams[' '.join(the_gram)] += 1
            except IndexError:
                pass

        # turn counter into list of results
        for k, v in list(ngrams.items()):
            if v > 1:
                for i in range(v):
                    result.append(k)
        if countmode:
            return(len(result))
        else:
            return result

    def compiler(pattern):
        """compile regex or fail gracefully"""
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value,
                          exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def tok_by_reg(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        comped = compiler(pattern)
        if comped == 'Bad query':
            return 'Bad query'
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if re.search(comped, token):
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(re.search(comped, token).group(0))
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for regex in plaintext corpora

        it searches over lines, so the user needs to be careful.
        """
        import re
        if concordancing:
            pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})'
        compiled_pattern = compiler(pattern)
        if compiled_pattern == 'Bad query':
            return 'Bad query'
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        if not concordancing:
            for index, i in enumerate(matches):
                if type(i) == tuple:
                    matches[index] = i[0]
        if countmode:
            return(len(matches))
        else:
            return matches

    def correct_spelling(a_string):
        if not spelling:
            return a_string
        from dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        spell_out = []
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r

    def plaintext_simple_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for tokens in plaintext corpora"""
        import re
        result = []
        if type(pattern) == str:
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})'
            else:
                pat = r'\b' + re.escape(p) + r'\b'
            pat = compiler(pat)
            if pat == 'Bad query':
                return 'Bad query'
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                matches = [list(m) for m in matches]
                for i in matches:
                    result.append(i)
            else:   
                for m in range(len(matches)):
                    result.append(p)
        return result

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers)
    
    locs['search'] = search
    locs['query'] = query
    locs['just_speakers'] = just_speakers
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess

    if im:
        from multiprocess import pmultiquery
        return pmultiquery(**locs)

    datatype = corpus.datatype
    singlefile = corpus.singlefile

    # store all results in here
    results = {}
    # check if just counting
    countmode = 'c' in show
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    ############################################
    # Determine the search function to be used #
    ############################################
    
    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    if not just_speakers and 't' in list(search.keys()):
        simple_tregex_mode = True
    else:
        if corpus.datatype == 'plaintext':
            if search.get('n'):
                raise NotImplementedError('Use a tokenised corpus for n-gramming.')
                #searcher = plaintext_ngram
                optiontext = 'n-grams via plaintext'
            if search.get('w'):
                if kwargs.get('regex', True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = 'Searching plaintext'

        elif corpus.datatype == 'tokens':
            if search.get('n'):
                searcher = tok_ngrams
                optiontext = 'n-grams via tokens'
            elif search.get('w'):
                if kwargs.get('regex', True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if type(search.get('w')) == list:
                    searcher = tok_by_list
                optiontext = 'Searching tokens'
        only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f']
        if corpus.datatype != 'parse' and any(i in only_parse for i in list(search.keys())):
            raise ValueError('Need parsed corpus to search with "%s" option(s).' % ', '.join([i for i in list(search.keys()) if i in only_parse]))

        elif corpus.datatype == 'parse':
            if search.get('t'):
                searcher = slow_tregex
            elif search.get('s'):
                searcher = get_stats
                statsmode = True
                optiontext = 'General statistics'
                global numdone
                numdone = 0
            else:
                from depsearch import dep_searcher
                searcher = dep_searcher
                optiontext = 'Dependency querying'

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get('t'):
        query = search.get('t')

        # check the query
        q = tregex_engine(corpus = False, query = search.get('t'), 
                          options = ['-t'], check_query = True, root = root)
        if q is False:
            if root:
                return 'Bad query'
            else:
                return

        optiontext = 'Searching parse trees'
        if 'p' in show or 'pl' in show:
            translated_option = 'u'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 't' in show:
            translated_option = 'o'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 'w' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'c' in show:
            count_results = {}
            only_count = True
            translated_option = 'C'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __'  % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'l' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'

        query = search['t']

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not corpus.subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name):
            to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if conc:
        message = 'Concordancing'
    else:
        message = 'Interrogating'
    if kwargs.get('printstatus', True):
        thetime = strftime("%H:%M:%S", localtime())

        sformat = '\n                 '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())])
        if search == {'s': r'.*'}:
            sformat = 'features'
        welcome = '\n%s: %s %s ...\n          %s\n          Query: %s\n' % \
                  (thetime, message, corpus.name, optiontext, sformat)
        print(welcome)

    ############################################
    #           Make progress bar              #
    ############################################

    if simple_tregex_mode:
        total_files = len(list(to_iterate_over.keys()))
    else:
        if search.get('s'):
            total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12
        else:
            total_files = sum([len(x) for x in list(to_iterate_over.values())])

    par_args = {'printstatus': kwargs.get('printstatus', True),
                'root': root, 
                'note': note,
                'length': total_files}

    term = None
    if kwargs.get('paralleling', None) is not None:
        from blessings import Terminal
        term = Terminal()
        par_args['terminal'] = term
        par_args['linenum'] = kwargs.get('paralleling')

    outn = kwargs.get('outname', '')
    if outn:
        outn = outn + ': '
    tstr = '%s%d/%d' % (outn, current_iter, total_files)
    p = animator(None, None, init = True, tot_string = tstr, **par_args)
    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):

        if countmode or conc:
            results[subcorpus_name] = []
        else:
            results[subcorpus_name] = Counter()
        
        # tregex over subcorpora, not files
        if simple_tregex_mode:

            op = ['-o', '-' + translated_option]                
            result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)
            
            current_iter += 1

            if not countmode:
                result = Counter(format_tregex(result))

            if conc:
                op.append('-w')
                whole_result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)
                
                if not only_format_match:
                    whole_result = format_tregex(whole_result)

                result = make_conc_lines_from_whole_mid(whole_result, result, speakr = False)

                if spelling:
                    for index, line in enumerate(result):
                        result[index] = [correct_spelling(b) for b in line]

            if countmode:
                results[subcorpus_name].append(result)
            else:
                results[subcorpus_name] += result

            if kwargs.get('paralleling', None) is not None:
                tstr = '%s%d/%d' % (outn, current_iter + 2, total_files)
            else:
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:

                if corpus.datatype == 'parse':
                    with open(f.path, 'r') as data:
                        data = data.read()
                        from corenlp_xml.document import Document
                        try:
                            corenlp_xml = Document(data)
                        except:
                            print('Could not read file: %s' % f.path)
                            continue
                        if just_speakers:  
                            sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                            if not sents:
                                continue
                        else:
                            sents = corenlp_xml.sentences

                        res = searcher(sents, search = search, show = show,
                            dep_type = dep_type,
                            exclude = exclude,
                            excludemode = excludemode,
                            searchmode = searchmode,
                            lemmatise = False,
                            case_sensitive = case_sensitive,
                            concordancing = conc,
                            only_format_match = only_format_match)
                        
                        if res == 'Bad query':
                            return 'Bad query'

                        if searcher == slow_tregex and not countmode:
                            res = format_tregex(res)

                elif corpus.datatype == 'tokens':
                    import pickle
                    with codecs.open(f.path, "rb") as fo:
                        data = pickle.load(fo)
                    res = searcher(list(search.values())[0], data, split_contractions = split_contractions, 
                        concordancing = conc)
                    if conc:
                        for index, line in enumerate(res):
                            line.insert(0, '')

                elif corpus.datatype == 'plaintext':
                    with codecs.open(f.path, 'rb', encoding = 'utf-8') as data:
                        data = data.read()
                        res = searcher(list(search.values())[0], data, 
                            concordancing = conc)
                        if conc:
                            for index, line in enumerate(res):
                                line.insert(0, '')

                if countmode:
                    results[subcorpus_name] += res
                    continue
            
                # add filename and do lowercasing for conc
                if conc:
                    for index, line in enumerate(res):
                        line.insert(0, f.name)
                        if not preserve_case:
                            line = [b.lower() for b in line]
                        if spelling:
                            line = [correct_spelling(b) for b in line]
                        results[subcorpus_name] += [line]

                # do lowercasing and spelling
                else:
                    if not preserve_case:
                        res = [r.lower() for r in res]
                    if spelling:
                        res = [correct_spelling(r) for r in res]
                    results[subcorpus_name] += Counter(res)

                if not statsmode:
                    current_iter += 1
                    if kwargs.get('paralleling', None) is not None:
                        tstr = '%s%d/%d' % (outn, current_iter + 2, total_files)
                    else:
                        tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                    animator(p, current_iter, tstr, **par_args)

    # delete temp file if there
    import os
    if os.path.isfile('tmp.txt'):
        os.remove('tmp.txt')

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if conc:
        all_conc_lines = []
        for sc_name, resu in sorted(results.items()):

            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            pindex = 'c f s l m r'.encode('utf-8').split()
            for fname, spkr, start, word, end in unique_results:
                #spkr = str(spkr, errors = 'ignore')
                fname = os.path.basename(fname)

                # the use of ascii here makes sure the string formats ok, but will also screw over
                # anyone doing non-english work. so, change to utf-8, then fix errors as they come
                # in the corpkit-gui "add_conc_lines_to_window" function
                all_conc_lines.append(Series([sc_name,
                                     fname, \
                                     spkr, \
                                     start, \
                                     word, \
                                     end], \
                                     index = pindex))

        # randomise results...
        if random:
            from random import shuffle
            shuffle(all_conc_lines)

        df = pd.concat(all_conc_lines, axis = 1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            df.columns = ['c', 'f', 's', 'l', 'm', 'r']
        else:
            df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link']

        if all(x == '' for x in list(df['s'].values)):
            df.drop('s', axis = 1, inplace = True)

        if kwargs.get('note'):
            kwargs['note'].progvar.set(100)

        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = '\n\n%s: Concordancing finished! %d matches.\n' % (thetime, len(df.index))
            print(finalstring)

        from interrogation import Concordance
        output = Concordance(df)
        output.query = locs
        if quicksave:
            output.save()
        return output 

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    else:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in list(results.values()) for item in sublist])
            for word in unique_results:
                the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items(), key=lambda x: x[0])]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index = sorted(results.keys()))

            numentries = len(df.columns)
            tot = df.sum(axis = 1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get('df1_always_df'):
                        df = Series(df.ix[0])
                        df.sort(ascending = False)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot


        # sort by total
        if type(df) == pd.core.frame.DataFrame:
            if not df.empty:   
                df.ix['Total-tmp'] = df.sum()
                the_tot = df.ix['Total-tmp']
                df = df[the_tot.argsort()[::-1]]
                df = df.drop('Total-tmp', axis = 0)

        # format final string
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %d matches.' % tot
            else:
                finalstring += ' %d unique results, %d total occurrences.' % (numentries, total_total)
            print(finalstring)

        interro = Interrogation(results = df, totals = tot, query = locs)
        
        if quicksave:
            interro.save()
        
        return interro
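
# A hypothetical usage sketch for the function defined above; the corpus path and
# the Tregex query are placeholders, not taken from any example in this file.
interro = interrogator('data/parsed-corpus', {'t': r'/NN.?/ < __'}, show=['w'])
print(interro.results.head())    # subcorpora as rows, match forms as columns
print(interro.totals)            # per-subcorpus totals

# conc=True returns a Concordance instead, built from the DataFrame assembled above
# (assuming Concordance exposes the usual DataFrame interface):
lines = interrogator('data/parsed-corpus', {'t': r'/NN.?/ < __'}, show=['w'], conc=True)
print(lines[['l', 'm', 'r']].head())   # left context, match, right context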
Example #23
0
    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEqual(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is platform-dep
            hist = s.value_counts(sort=False)
            hist.sort()
            expected = Series([3, 1, 4, 2], index=list("acbd"))
            expected.sort()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list("cdab"))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
            s = klass(s_values)
            expected = Series([4, 3, 2], index=["b", "a", "d"])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.array(["a", "b", np.nan, "d"], dtype="O"))
            self.assertEqual(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEqual(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            txt = "\n".join(
                [
                    "xxyyzz20100101PIE",
                    "xxyyzz20100101GUM",
                    "xxyyzz20100101EGG",
                    "xxyyww20090101EGG",
                    "foofoo20080909PIE",
                    "foofoo20080909GUM",
                ]
            )
            f = StringIO(txt)
            df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"])

            s = klass(df["dt"].copy(), name="dt")

            idx = pd.to_datetime(["2010-01-01 00:00:00Z", "2008-09-09 00:00:00Z", "2009-01-01 00:00:00X"])
            expected_s = Series([3, 2, 1], index=idx, name="dt")
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np.array(
                ["2010-01-01 00:00:00Z", "2009-01-01 00:00:00Z", "2008-09-09 00:00:00Z"], dtype="datetime64[ns]"
            )
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assertTrue(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEqual(s.nunique(), 3)

            # with NaT
            s = df["dt"].copy()
            s = klass([v for v in s.values] + [pd.NaT], name="dt")

            result = s.value_counts()
            self.assertEqual(result.index.dtype, "datetime64[ns]")
            tm.assert_series_equal(result, expected_s)

            result = s.value_counts(dropna=False)
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, "datetime64[ns]")
            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT or unique[3].astype("int64") == pd.tslib.iNaT)

            self.assertEqual(s.nunique(), 3)
            self.assertEqual(s.nunique(dropna=False), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td, name="dt")

            result = td.value_counts()
            expected_s = Series([6], index=[Timedelta("1day")], name="dt")
            tm.assert_series_equal(result, expected_s)

            expected = TimedeltaIndex(["1 days"])
            if isinstance(td, TimedeltaIndex):
                self.assertTrue(td.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(td.unique(), expected.values)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2, name="dt")
            result2 = td2.value_counts()
            tm.assert_series_equal(result2, expected_s)
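
# A small standalone recap of the behaviour exercised above, using only stable
# pandas APIs (value_counts with normalize and bins); a sketch, not part of the test.
import pandas as pd

s = pd.Series(["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"])
print(s.value_counts())                                # descending counts: b=4, a=3, d=2, c=1
print(s.value_counts(normalize=True))                  # relative frequencies summing to 1.0
print(pd.Series([1, 1, 2, 3]).value_counts(bins=1))    # a single interval holding all four values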
Example #24
0
ser1

ser1.sort_index()
ser1

ser1.order()

from numpy.random import randn

ser2 = Series(randn(10))
ser2

ser2.rank()
ser2

ser2.sort()
ser2

ser2.rank()

#------------------
# lec022
#------------------
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
arr

dframe1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])
dframe1

# Aggregate by column
dframe1.sum()
Example #25
0
def plot_lowlevel(plot_spec: pd.Series,
                  ax: 'matplotlib.pyplot.Axes',
                  conditions: pd.Series,
                  ms: pd.DataFrame,
                  plot_sim: bool) -> 'matplotlib.pyplot.Axes':
    """
    Plotting routine / preparations: set properties of figure and plot
    the data with given specifications (lineplot with errorbars, or barplot)

    Parameters:

        plot_spec:
            contains defined data format (visualization file)
        ax:
            axes to which to plot
        conditions:
            Values on x-axis
        ms:
            contains measurement data which should be plotted
        plot_sim:
            tells whether or not simulated data should be plotted as well

    Returns:
        Updated axis object.
    """
    warnings.warn("This function will be removed in future releases. ",
                  DeprecationWarning)

    # set yScale
    if plot_spec[Y_SCALE] == LIN:
        ax.set_yscale("linear")
    elif plot_spec[Y_SCALE] == LOG10:
        ax.set_yscale("log")
    elif plot_spec[Y_SCALE] == LOG:
        ax.set_yscale("log", base=np.e)

    # add yOffset
    ms.loc[:, 'mean'] = ms['mean'] + plot_spec[Y_OFFSET]
    ms.loc[:, 'repl'] = ms['repl'] + plot_spec[Y_OFFSET]
    if plot_sim:
        ms.loc[:, 'sim'] = ms['sim'] + plot_spec[Y_OFFSET]

    # set type of noise
    if plot_spec[PLOT_TYPE_DATA] == MEAN_AND_SD:
        noise_col = 'sd'
    elif plot_spec[PLOT_TYPE_DATA] == MEAN_AND_SEM:
        noise_col = 'sem'
    elif plot_spec[PLOT_TYPE_DATA] == PROVIDED:
        noise_col = 'noise_model'

    if plot_spec.plotTypeSimulation == LINE_PLOT:

        # set xScale
        if plot_spec[X_SCALE] == LIN:
            ax.set_xscale("linear")
        elif plot_spec[X_SCALE] == LOG10:
            ax.set_xscale("log")
        elif plot_spec[X_SCALE] == LOG:
            ax.set_xscale("log", base=np.e)
        # equidistant
        elif plot_spec[X_SCALE] == 'order':
            ax.set_xscale("linear")
            # check if conditions are monotone decreasing or increasing
            if np.all(np.diff(conditions) < 0):             # monot. decreasing
                xlabel = conditions[::-1]                   # reversing
                conditions = range(len(conditions))[::-1]   # reversing
                ax.set_xticks(range(len(conditions)), xlabel)
            elif np.all(np.diff(conditions) > 0):
                xlabel = conditions
                conditions = range(len(conditions))
                ax.set_xticks(range(len(conditions)), xlabel)
            else:
                raise ValueError('Error: x-conditions do not coincide, '
                                 'some are mon. increasing, some monotonically'
                                 ' decreasing')

        # add xOffset
        conditions = conditions + plot_spec[X_OFFSET]

        # plotting all measurement data
        label_base = plot_spec[LEGEND_ENTRY]
        if plot_spec[PLOT_TYPE_DATA] == REPLICATE:
            p = ax.plot(
                conditions[conditions.index.values],
                ms.repl[ms.repl.index.values], 'x',
                label=label_base
            )

        # construct errorbar-plots: noise specified above
        else:
            # sort index for the case that indices of conditions and
            # measurements differ: if indep_var='time', conditions is a numpy
            # array; for indep_var=observable it's a Series
            if isinstance(conditions, np.ndarray):
                conditions.sort()
            elif isinstance(conditions, pd.core.series.Series):
                conditions.sort_index(inplace=True)
            else:
                raise ValueError('Strange: conditions object is neither numpy'
                                 ' nor series...')
            ms.sort_index(inplace=True)
            # sorts according to ascending order of conditions
            scond, smean, snoise = \
                zip(*sorted(zip(conditions, ms['mean'], ms[noise_col])))
            p = ax.errorbar(
                scond, smean, snoise,
                linestyle='-.', marker='.', label=label_base
            )
        # construct simulation plot
        colors = p[0].get_color()
        if plot_sim:
            xs, ys = zip(*sorted(zip(conditions, ms['sim'])))
            ax.plot(
                xs, ys, linestyle='-', marker='o',
                label=label_base + " simulation", color=colors
            )

    # construct bar plot
    elif plot_spec[PLOT_TYPE_SIMULATION] == BAR_PLOT:
        x_name = plot_spec[LEGEND_ENTRY]

        if plot_sim:
            bar_kwargs = {
                'align': 'edge',
                'width': -1/3,
            }
        else:
            bar_kwargs = {
                'align': 'center',
                'width': 2/3,
            }

        p = ax.bar(x_name, ms['mean'], yerr=ms[noise_col],
                   color=sns.color_palette()[0], **bar_kwargs)

        if plot_sim:
            colors = p[0].get_facecolor()
            bar_kwargs['width'] = -bar_kwargs['width']
            ax.bar(x_name, ms['sim'], color='white',
                   edgecolor=colors, **bar_kwargs)

    # construct scatter plot
    elif plot_spec[PLOT_TYPE_SIMULATION] == SCATTER_PLOT:
        if not plot_sim:
            raise NotImplementedError('Scatter plots do not work without'
                                      ' simulation data')
        ax.scatter(ms['mean'], ms['sim'],
                   label=plot_spec[LEGEND_ENTRY])
        ax = square_plot_equal_ranges(ax)

    # show 'e' as basis not 2.7... in natural log scale cases
    def ticks(y, _):
        return r'$e^{{{:.0f}}}$'.format(np.log(y))
    if plot_spec[X_SCALE] == LOG:
        ax.xaxis.set_major_formatter(mtick.FuncFormatter(ticks))
    if plot_spec[Y_SCALE] == LOG:
        ax.yaxis.set_major_formatter(mtick.FuncFormatter(ticks))

    # set further plotting/layout settings

    if not plot_spec[PLOT_TYPE_SIMULATION] == BAR_PLOT:
        ax.legend()
    ax.set_title(plot_spec[PLOT_NAME])
    ax.relim()
    ax.autoscale_view()

    return ax
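
# The 'order' x-scale branch above hinges on a simple monotonicity test over the
# x-conditions; a standalone illustration of just that check (NumPy only):
import numpy as np

conditions = np.array([10.0, 3.0, 1.0, 0.3])
print(np.all(np.diff(conditions) < 0))   # True: strictly decreasing, so the branch reverses it
print(np.all(np.diff(conditions) > 0))   # False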
Example #26
0
ser1.sort_index()

# In[3]:

ser1.order()

# In[4]:

from numpy.random import randn
ser2 = Series(randn(10))
ser2

# In[6]:

ser2.sort(inplace=True)

# In[7]:

ser2.rank()

# In[8]:

ser3 = Series(randn(10))
ser3

# In[9]:

ser3.rank()

# In[ ]:
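
# The same steps with the current pandas API (Series.sort and Series.order were
# later removed); a sketch, not part of the notebook above:
from numpy.random import randn
from pandas import Series

ser2 = Series(randn(10))
ser2 = ser2.sort_values()     # replaces the removed ser2.sort(inplace=True)
print(ser2.rank())            # ranks follow the sorted values, as above
# likewise, ser1.sort_values() replaces the removed ser1.order()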
Example #27
0
meta['month'] = meta['date'].apply(lambda x: x.replace(day=1))


# Get captions for images.
topics_dict = {}
with open(doc_topic_words_filepath) as f:
    for line in f:
        data = line.strip().split(',')
        topic = data[0]
        words = data[1:]
        topics_dict[topic] = words


# Write out images.
topics = topics_dict.keys()
topics.sort()


index = []
for year in range(1973, 1977):
    for month in range(1, 13):
        date = datetime.datetime.strptime("1/%s/%s" % (month, year),
                                          "%d/%m/%Y")
        index.append(date)

important_classifications = ['CONFIDENTIAL', 'UNCLASSIFIED',
                             'LIMITED OFFICIAL USE', 'SECRET']

colors = {'CONFIDENTIAL'            : 'm-',
          'UNCLASSIFIED'            : 'c-',
          'LIMITED OFFICIAL USE'    : 'y-',
Example #28
0
def statistics(request):
    """
    This function is called when the Statistics button is pressed by the user. Its purpose is to
    take the selected platforms as well as some statistical parameters and perform two
    statistical analyses: a t-test and an FDR analysis.

    :param request:
    :return: a rendered HTML page.
    """
    cutoff_type = request.GET.get('cutoff_type')
    cutoff_value = float(request.GET.get('cutoff_value'))
    display_values = request.session.get('display_values', {})
    spps = request.GET.get('spps')
    spps = spps.split(',')
    combined_series = []
    display_profile = None
    for spp in spps:
        _, study, display_profile, platform = spp.split('|')
        profile = display_profile.replace('_', '-')
        sample_ids = geo_data.get_sample_ids(study, profile, platform)
        control_sample_ids = []
        diseased_sample_ids = []
        for sample_id in sample_ids:
            sample_attributes = geo_data.get_sample_attributes(study, profile, platform, sample_id)
            if sample_attributes['control']:
                control_sample_ids.append(sample_id)
            else:
                diseased_sample_ids.append(sample_id)

        genes = geo_data.get_all_gene_symbols(study, profile, platform)
        no_of_genes = len(genes)
        control_exprs = zeros((no_of_genes, len(control_sample_ids)))
        diseased_exprs = zeros((no_of_genes, len(diseased_sample_ids)))

        for (g_index, gene) in enumerate(genes):
            gene_exprs = zeros(len(control_sample_ids))
            for (s_index, sample_id) in enumerate(control_sample_ids):
                expr_value = geo_data.get_gene_expression_value(study, profile, platform, sample_id, gene)
                if expr_value == 'None':
                    continue
                gene_exprs[s_index] = expr_value
            control_exprs[g_index] = gene_exprs

            gene_exprs = zeros(len(diseased_sample_ids))
            for (s_index, sample_id) in enumerate(diseased_sample_ids):
                expr_value = geo_data.get_gene_expression_value(study, profile, platform, sample_id, gene)
                if expr_value == 'None':
                    continue
                gene_exprs[s_index] = expr_value
            diseased_exprs[g_index] = gene_exprs

        control_df = DataFrame(control_exprs, index=genes, columns=control_sample_ids)
        diseased_df = DataFrame(diseased_exprs, index=genes, columns=diseased_sample_ids)

        # Perform the t-test and create a pandas Series
        t_statistics, p_values = ttest_ind(control_df.T, diseased_df.T)
        p_values_series = Series(p_values, index=genes)

        # Perform the fdr analysis, create a pandas Series and sort the series
        reject_fdr, pval_fdr = fdr_correction(p_values_series, method='indep')
        fdr_values_series = Series(pval_fdr, index=genes)
        p_values_series.sort(ascending=True)

        combined_series = []
        for i in range(len(p_values_series)):
            symbol = p_values_series.index[i]
            p_value = p_values_series[i]
            if cutoff_type == 'p-value' and p_value > cutoff_value:
                break
            fdr_value = fdr_values_series.get(symbol)
            if cutoff_type == 'fdr-value' and fdr_value > cutoff_value:
                break
            combined_series.append([symbol, p_value, fdr_value])

        display_values[display_profile] = combined_series

    request.session['display_values'] = display_values
    response = render_to_string('statistics.html',
                                {display_profile: combined_series})

    return HttpResponse(response)
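
# The view above mixes Django plumbing with the statistics. Here is a compact sketch of just
# the t-test + FDR step on two expression matrices. The gene symbols and data are synthetic,
# and statsmodels' multipletests (Benjamini-Hochberg) is used as a stand-in for the
# fdr_correction helper called above, whose exact origin the snippet does not show.
import numpy as np
from pandas import DataFrame, Series
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

genes = ['BRCA1', 'TP53', 'EGFR', 'MYC']          # illustrative gene symbols
control = DataFrame(np.random.randn(len(genes), 5), index=genes)
diseased = DataFrame(np.random.randn(len(genes), 6) + 0.5, index=genes)

# row-wise t-test: transpose so samples are rows and genes are columns
t_stats, p_values = ttest_ind(control.T, diseased.T)
p_series = Series(p_values, index=genes).sort_values()

# Benjamini-Hochberg FDR as a stand-in for fdr_correction(..., method='indep')
reject, p_fdr, _, _ = multipletests(p_series, method='fdr_bh')
results = DataFrame({'p_value': p_series, 'fdr': p_fdr, 'significant': reject})
print(results)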
Example #29
0
    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEqual(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is platform-dep
            hist = s.value_counts(sort=False)
            hist.sort()
            expected = Series([3, 1, 4, 2], index=list('acbd'))
            expected.sort()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list('cdab'))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError,
                              lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({
                0.998: 2,
                1.5: 1,
                2.0: 0,
                2.5: 1
            },
                          index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({
                0.998: 0.5,
                1.5: 0.25,
                2.0: 0.0,
                2.5: 0.25
            },
                           index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = [
                'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'
            ]
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(
                s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
            self.assertEqual(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEqual(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            txt = "\n".join([
                'xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
                'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'
            ])
            f = StringIO(txt)
            df = pd.read_fwf(f,
                             widths=[6, 8, 3],
                             names=["person_id", "dt", "food"],
                             parse_dates=["dt"])

            s = klass(df['dt'].copy())

            idx = pd.to_datetime([
                '2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z',
                '2009-01-01 00:00:00X'
            ])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np.array([
                '2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z',
                '2008-09-09 00:00:00Z'
            ],
                                dtype='datetime64[ns]')
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assertTrue(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEqual(s.nunique(), 3)

            # with NaT
            s = df['dt'].copy()
            s = klass([v for v in s.values] + [pd.NaT])

            result = s.value_counts()
            self.assertEqual(result.index.dtype, 'datetime64[ns]')
            tm.assert_series_equal(result, expected_s)

            result = s.value_counts(dropna=False)
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, 'datetime64[ns]')
            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT
                            or unique[3].astype('int64') == pd.tslib.iNaT)

            self.assertEqual(s.nunique(), 3)
            self.assertEqual(s.nunique(dropna=False), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td)

            result = td.value_counts()
            expected_s = Series([6], index=[Timedelta('1day')])
            tm.assert_series_equal(result, expected_s)

            expected = TimedeltaIndex(['1 days'])
            if isinstance(td, TimedeltaIndex):
                self.assertTrue(td.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(td.unique(), expected.values)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2)
            result2 = td2.value_counts()

            tm.assert_series_equal(result2, expected_s)
Example #30
0
###															###
###															###
###############################################################

# go to http://pandas.pydata.org/pandas-docs/stable/cookbook.html for several examples

df3 + df4 #adds dataframes
df4.add(df3, fill_value=0) # same addition, but cells missing from one frame are treated as 0

ser3 = df3.ix[0] # forming a Series from a DataFrame: the first row is returned as a Series

ser3.sort_index() # sorts according to index

ser5 = ser4.order() # sorts according to value, but is NOT in place

ser4.sort() # in-place sorting by value

df1.sum() #sum columns

df1.sum(axis = 1) # sum rows

df1.min() # minimum values across columns

df1.idxmin() #index of the minimum values

df1.cumsum() # returns dataframe with cumulative sums across columns

df1.describe() # returns summary stats across columns

df.drop_duplicates() # drops duplicate rows
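
# The notes above are shorthand; a runnable sketch of the alignment/fill_value point,
# using small illustrative frames (df3/df4 here are stand-ins, not the originals):
import numpy as np
from pandas import DataFrame

df3 = DataFrame(np.arange(4).reshape(2, 2), columns=['A', 'B'])
df4 = DataFrame(np.arange(9).reshape(3, 3), columns=['A', 'B', 'C'])

print(df3 + df4)                    # cells missing from either frame become NaN
print(df4.add(df3, fill_value=0))   # cells missing from one frame are treated as 0 instead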
Example #31
0
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
ser1 = Series(range(3), index=['C', 'A', 'B'])
ser2 = ser1.sort_index()
print(ser2)
print(ser1.order())
from numpy.random import randn
ser3 = Series(randn(10))
print(ser3.rank())
ser3.sort()   # sort() sorts in place and returns None, so sort first, then print
print(ser3)
Example #32
0
def check_close_price_by_variety_id(variety_id, instrument_list):
    global month_str, pre_month_str
    global close_limit_update_time
    sixth_instrument = None
    quote_map = {}
    for one_file_name in instrument_list:
        quote_file = open(one_file_name, "r")
        quote_list = quote_file.readlines()
        quote_file.close()
        instrument_id = one_file_name.split("\\")[-1].split(".")[0]
        close_quote = CBest_Market_Data_Field()
        if len(quote_list) > 3:
            if len(quote_list[-1]) > 2:
                close_quote = Get_CBest_Market_Data_Field_From_Line(
                    quote_list[-2])
            else:
                close_quote = Get_CBest_Market_Data_Field_From_Line(
                    quote_list[-3])
            quote_map[instrument_id] = close_quote

    if len(quote_map) > 2:
        # find the dominant (most actively traded) contract
        best_quote_frame = Series()
        for (instrument_id, close_quote) in quote_map.items():
            if instrument_id[-2:] != month_str:
                best_quote = Series(
                    [instrument_id, close_quote.Total_Match_Volume])
                best_quote_frame = best_quote_frame.append(best_quote)
        best_quote_frame = Series(best_quote_frame[1].values,
                                  index=best_quote_frame[0].values)
        best_quote_frame.sort()
        main_instrument = best_quote_frame.index[-1]
        sub_instrument = best_quote_frame.index[-2]
        ssub_instrument = best_quote_frame.index[-3]
        forth_instrument = best_quote_frame.index[-4]
        fifth_instrument = best_quote_frame.index[-5]
        if len(best_quote_frame.index) > 5:
            sixth_instrument = best_quote_frame.index[-6]
        first_instrument = main_instrument
        second_instrument = sub_instrument
        trigger_ref_spread_price = (float(quote_map[main_instrument].Bid_Price1) + float(quote_map[main_instrument].Ask_Price1)) / 2 - \
                                   (float(quote_map[second_instrument].Bid_Price1) + float(quote_map[second_instrument].Ask_Price1)) / 2
        tick, exchange_id = get_attribution(variety_id)

        if second_instrument[-2:] == pre_month_str:
            trigger_price_range = trigger_price_range_dict[variety_id][
                0] + 10 * tick
        else:
            trigger_price_range = trigger_price_range_dict[variety_id][0]

        extreme_price_range = extreme_price_range_dict[variety_id][0]
        open_order_volume = open_order_volume_dict[variety_id][0]
        max_open_order_volume = max_open_order_volume_dict[variety_id][0]
        stop_tick = stop_tick_dict[variety_id][0] * tick
        line4 = "\t\t\t<main_instrument>" + main_instrument + "</main_instrument>\n"
        line6 = "\t\t\t<sub_instrument>" + second_instrument + "</sub_instrument>\n"
        line7 = "\t\t\t<open_order_volume>" + str(
            int(open_order_volume)) + "</open_order_volume>\n"
        line71 = "\t\t\t<stop_tick>" + str(float(stop_tick)) + "</stop_tick>\n"
        line8 = "\t\t\t<tigger_ref_spread_price>" + str(
            trigger_ref_spread_price) + "</tigger_ref_spread_price>\n"
        line9 = "\t\t\t<tigger_price_range>" + str(
            int(trigger_price_range)) + "</tigger_price_range>\n"
        line91 = "\t\t\t<price_multiple>" + str(
            float(tick) * 2) + "</price_multiple>\n"
        line92 = "\t\t\t<order_multiple>" + str(
            int(math.ceil(
                float(open_order_volume) / 2))) + "</order_multiple>\n"
        line93 = "\t\t\t<max_open_order_volume>" + str(
            int(max_open_order_volume)) + "</max_open_order_volume>\n"
        line10 = "\t\t</auction_arbi_variety>\n"
        line_add1 = "\t\t\t<open_over_take>" + str(
            float(tick)) + "</open_over_take>\n"
        line_add2 = "\t\t\t<extreme_price_range>" + str(
            int(extreme_price_range)) + "</extreme_price_range>\n"
        line_add4 = "\t\t\t<main_order_volume>" + str(
            int(main_order_volume)) + "</main_order_volume>\n"
        line_add5 = "\t\t\t<main_lost_tick>" + str(
            float(main_lost_tick_num * tick)) + "</main_lost_tick>\n"
        line_add6 = "\t\t\t<total_volume_limit_ratio>" + str(
            float(total_volume_limit_ratio)) + "</total_volume_limit_ratio>\n"
        line_add7 = "\t\t\t<spam_num>" + str(int(spam_num)) + "</spam_num>\n"
        line_add8 = "\t\t\t<first_level_volume>" + str(
            int(first_level_volume)) + "</first_level_volume>\n"
        line_add9 = "\t\t\t<second_level_volume>" + str(
            int(second_level_volume)) + "</second_level_volume>\n"
        line_add10 = "\t\t\t<third_level_volume>" + str(
            int(third_level_volume)) + "</third_level_volume>\n"
        line_add11 = "\t\t\t<reserved_profit>" + str(
            reserved_tick * float(tick)) + "</reserved_profit>\n"
        if exchange_id == 'SHFE':
            _, mean_volume = get_open_volume_series(main_instrument,
                                                    trading_day)
            if variety_id not in latency_variety_id_array:
                line3 = "\t\t<auction_arbi_variety  exchange_id=\"" + exchange_id + "\" variety_id=\"" + variety_id + \
                        "\" tick=\"" + str(float(tick)) + "\" trigger_time=\"20:58:59.700\"" + " try_czce_order_time=\"20:58:30.000\">\n"
            else:
                line3 = "\t\t<auction_arbi_variety  exchange_id=\"" + exchange_id + "\" variety_id=\"" + variety_id + \
                        "\" tick=\"" + str(float(tick)) + "\" trigger_time=\"20:58:59.400\"" + " try_czce_order_time=\"20:58:30.000\">\n"
                line93 = "\t\t\t<max_open_order_volume>" + str(
                    int(max_open_order_volume /
                        2)) + "</max_open_order_volume>\n"
            line_add3 = "\t\t\t<main_average_his_volume>" + str(
                int(mean_volume * open_volume_ma_limit_ratio)
            ) + "</main_average_his_volume>\n"
            print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, line92, \
                line93, line_add3, line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10, line_add11, line10
            reach_limit_price_result_file.flush()
        else:
            line3 = "\t\t<auction_arbi_variety  exchange_id=\"" + exchange_id + "\" variety_id=\"" + variety_id + \
                    "\" tick=\"" + str(float(tick)) + "\" trigger_time=\"20:58:59.450\"" + " try_czce_order_time=\"20:58:30.000\">\n"
            line_add3 = "\t\t\t<main_average_his_volume>" + str(
                200) + "</main_average_his_volume>\n"
            print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, line92, \
                line93, line_add3, line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10, line_add11, line10
            reach_limit_price_result_file.flush()

        if variety_id in two_variety_list:
            second_instrument = ssub_instrument
            trigger_price_range = trigger_price_range_dict[variety_id][1]
            trigger_ref_spread_price = (float(quote_map[main_instrument].Bid_Price1) + float(quote_map[main_instrument].Ask_Price1)) / 2 - \
                                   (float(quote_map[second_instrument].Bid_Price1) + float(quote_map[second_instrument].Ask_Price1)) / 2
            ref_spread_last_price = float(
                quote_map[main_instrument].Last_Price) - float(
                    quote_map[second_instrument].Last_Price)
            bid_ask_spread = abs(ref_spread_last_price -
                                 trigger_ref_spread_price) / tick
            if bid_ask_spread < 15 and quote_map[
                    second_instrument].Update_Time > close_limit_update_time:
                line6 = "\t\t\t<sub_instrument>" + second_instrument + "</sub_instrument>\n"
                line7 = "\t\t\t<open_order_volume>" + str(
                    int(math.ceil(float(open_order_volume) /
                                  2))) + "</open_order_volume>\n"
                line8 = "\t\t\t<tigger_ref_spread_price>" + str(
                    trigger_ref_spread_price) + "</tigger_ref_spread_price>\n"
                line9 = "\t\t\t<tigger_price_range>" + str(
                    int(trigger_price_range)) + "</tigger_price_range>\n"
                line91 = "\t\t\t<price_multiple>" + str(
                    float(tick) * 3) + "</price_multiple>\n"
                line93 = "\t\t\t<max_open_order_volume>" + str(
                    int(max_open_order_volume)) + "</max_open_order_volume>\n"
                line_add1 = "\t\t\t<open_over_take>" + str(
                    float(tick) * 3) + "</open_over_take>\n"
                print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, line92, \
                    line93, line_add3,  line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10,line_add11, line10
                reach_limit_price_result_file.flush()

        if variety_id in three_variety_list:
            second_instrument = forth_instrument
            trigger_price_range = trigger_price_range_dict[variety_id][2]
            trigger_ref_spread_price = (float(quote_map[main_instrument].Bid_Price1) + float(quote_map[main_instrument].Ask_Price1)) / 2 - \
                                   (float(quote_map[second_instrument].Bid_Price1) + float(quote_map[second_instrument].Ask_Price1)) / 2
            ref_spread_last_price = float(
                quote_map[main_instrument].Last_Price) - float(
                    quote_map[second_instrument].Last_Price)
            bid_ask_spread = abs(ref_spread_last_price -
                                 trigger_ref_spread_price) / tick
            if bid_ask_spread < 15 and quote_map[
                    second_instrument].Update_Time > close_limit_update_time:
                line6 = "\t\t\t<sub_instrument>" + second_instrument + "</sub_instrument>\n"
                line7 = "\t\t\t<open_order_volume>" + str(
                    int(math.ceil(float(open_order_volume) /
                                  3))) + "</open_order_volume>\n"
                line8 = "\t\t\t<tigger_ref_spread_price>" + str(
                    trigger_ref_spread_price) + "</tigger_ref_spread_price>\n"
                line9 = "\t\t\t<tigger_price_range>" + str(
                    int(trigger_price_range)) + "</tigger_price_range>\n"
                line91 = "\t\t\t<price_multiple>" + str(
                    float(tick) * 5) + "</price_multiple>\n"
                line93 = "\t\t\t<max_open_order_volume>" + str(
                    int(max_open_order_volume)) + "</max_open_order_volume>\n"
                line_add1 = "\t\t\t<open_over_take>" + str(
                    float(tick) * 5) + "</open_over_take>\n"
                print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, line92, \
                    line93, line_add3, line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10,line_add11, line10
                reach_limit_price_result_file.flush()

        if variety_id in forth_variety_list:
            second_instrument = fifth_instrument
            trigger_price_range = trigger_price_range_dict[variety_id][3]
            trigger_ref_spread_price = (float(quote_map[main_instrument].Bid_Price1) + float(quote_map[main_instrument].Ask_Price1)) / 2 - \
                                   (float(quote_map[second_instrument].Bid_Price1) + float(quote_map[second_instrument].Ask_Price1)) / 2
            ref_spread_last_price = float(
                quote_map[main_instrument].Last_Price) - float(
                    quote_map[second_instrument].Last_Price)
            bid_ask_spread = abs(ref_spread_last_price -
                                 trigger_ref_spread_price) / tick
            if bid_ask_spread < 15 and quote_map[
                    second_instrument].Update_Time > close_limit_update_time:
                line6 = "\t\t\t<sub_instrument>" + second_instrument + "</sub_instrument>\n"
                line7 = "\t\t\t<open_order_volume>" + str(
                    int(math.ceil(float(open_order_volume) /
                                  5))) + "</open_order_volume>\n"
                line8 = "\t\t\t<tigger_ref_spread_price>" + str(
                    trigger_ref_spread_price) + "</tigger_ref_spread_price>\n"
                line9 = "\t\t\t<tigger_price_range>" + str(
                    int(trigger_price_range)) + "</tigger_price_range>\n"
                line91 = "\t\t\t<price_multiple>" + str(
                    float(tick) * 8) + "</price_multiple>\n"
                line93 = "\t\t\t<max_open_order_volume>" + str(
                    int(max_open_order_volume)) + "</max_open_order_volume>\n"
                line_add1 = "\t\t\t<open_over_take>" + str(
                    float(tick) * 5) + "</open_over_take>\n"
                print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, line92, \
                    line93, line_add3, line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10,line_add11, line10
                reach_limit_price_result_file.flush()

        if variety_id in fifth_variety_list:
            ref_spread_last_price = float(
                quote_map[main_instrument].Last_Price) - float(
                    quote_map[second_instrument].Last_Price)
            bid_ask_spread = abs(ref_spread_last_price -
                                 trigger_ref_spread_price) / tick
            if bid_ask_spread < 15 and quote_map[
                    second_instrument].Update_Time > close_limit_update_time:
                second_instrument = sixth_instrument
                trigger_price_range = trigger_price_range_dict[variety_id][4]
                trigger_ref_spread_price = (float(quote_map[main_instrument].Bid_Price1) + float(quote_map[main_instrument].Ask_Price1)) / 2 - \
                                       (float(quote_map[second_instrument].Bid_Price1) + float(quote_map[second_instrument].Ask_Price1)) / 2
                line6 = "\t\t\t<sub_instrument>" + second_instrument + "</sub_instrument>\n"
                line7 = "\t\t\t<open_order_volume>" + str(
                    int(math.ceil(float(open_order_volume) /
                                  5))) + "</open_order_volume>\n"
                line8 = "\t\t\t<tigger_ref_spread_price>" + str(
                    trigger_ref_spread_price) + "</tigger_ref_spread_price>\n"
                line9 = "\t\t\t<tigger_price_range>" + str(
                    int(trigger_price_range)) + "</tigger_price_range>\n"
                line91 = "\t\t\t<price_multiple>" + str(
                    float(tick) * 10) + "</price_multiple>\n"
                line93 = "\t\t\t<max_open_order_volume>" + str(
                    int(max_open_order_volume)) + "</max_open_order_volume>\n"
                line_add1 = "\t\t\t<open_over_take>" + str(
                    float(tick) * 7) + "</open_over_take>\n"
                print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, line92, \
                    line93, line_add3, line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10,line_add11, line10
                reach_limit_price_result_file.flush()
Example #33
0
    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(s.value_counts(), expected)
            
            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEquals(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is platform-dep
            hist = s.value_counts(sort=False)
            hist.sort()
            expected = Series([3, 1, 4, 2], index=list('acbd'))
            expected.sort()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list('cdab'))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEquals(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
            self.assertEquals(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEquals(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
                             'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'])
            f = StringIO(txt)
            df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"],
                             parse_dates=["dt"])

            s = klass(df['dt'].copy())

            idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X'])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'],
                                dtype='datetime64[ns]')
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assert_(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEquals(s.nunique(), 3)

            # with NaT
            s = df['dt'].copy()
            s = klass([v for v in s.values] + [pd.NaT])

            result = s.value_counts()
            self.assertEqual(result.index.dtype, 'datetime64[ns]')
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, 'datetime64[ns]')
            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT)

            self.assertEquals(s.nunique(), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td)

            result = td.value_counts()
            expected_s = Series([6], index=[86400000000000])
            self.assertEqual(result.index.dtype, 'int64')
            tm.assert_series_equal(result, expected_s)

            # get nanoseconds to compare
            expected = np.array([86400000000000])
            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2)
            result2 = td2.value_counts()

            self.assertEqual(result2.index.dtype, 'int64')
            tm.assert_series_equal(result2, expected_s)

            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)
    def setCategories(self, Categories: Series):
        Categories = Categories.unique()
        Categories.sort()

        self.__Encoder = Encoder()
        self.__Encoder.fit(Categories)
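
# The method above relies on an Encoder class imported elsewhere. A minimal usage sketch,
# assuming Encoder refers to sklearn.preprocessing.LabelEncoder (an assumption, not confirmed
# by the fragment):
from pandas import Series
from sklearn.preprocessing import LabelEncoder as Encoder

categories = Series(['dog', 'cat', 'dog', 'bird']).unique()
categories.sort()                 # in-place sort of the numpy array of unique values

encoder = Encoder()
encoder.fit(categories)
print(encoder.transform(['cat', 'dog']))   # -> [1 2] given alphabetical ordering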