def testSeriesNested(self):
    s = Series([10, 20, 30, 40, 50, 60], name="series",
               index=[6, 7, 8, 9, 10, 15])
    s.sort()
    nested = {'s1': s, 's2': s.copy()}

    exp = {'s1': ujson.decode(ujson.encode(s)),
           's2': ujson.decode(ujson.encode(s))}
    self.assertTrue(ujson.decode(ujson.encode(nested)) == exp)

    exp = {'s1': ujson.decode(ujson.encode(s, orient="split")),
           's2': ujson.decode(ujson.encode(s, orient="split"))}
    self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp)

    exp = {'s1': ujson.decode(ujson.encode(s, orient="records")),
           's2': ujson.decode(ujson.encode(s, orient="records"))}
    self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp)

    exp = {'s1': ujson.decode(ujson.encode(s, orient="values")),
           's2': ujson.decode(ujson.encode(s, orient="values"))}
    self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp)

    exp = {'s1': ujson.decode(ujson.encode(s, orient="index")),
           's2': ujson.decode(ujson.encode(s, orient="index"))}
    self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp)
def hamm_entropy_kNN(X, y, kNN=3):
    # sort the Hamming-entropy values ascending and keep the kNN smallest,
    # together with their index labels
    d1 = hamm_entropy_Xy(X, y)
    d1 = Series(d1)
    d1.sort(ascending=True)
    ixs = d1.index
    d1_kNN = d1[0:kNN]
    ixs_kNN = ixs[0:kNN]
    return (np.array(d1_kNN), ixs_kNN)
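Series.sort() and its ascending keyword were deprecated and later removed from pandas in favour of sort_values()/sort_index(). A minimal sketch of the same smallest-k selection against the current API, using a hypothetical helper name:

import numpy as np
from pandas import Series

def smallest_k(values, k=3):
    # sort by value ascending -- the modern replacement for Series.sort(ascending=True)
    d1 = Series(values).sort_values(ascending=True)   # or: Series(values).nsmallest(k)
    d1_k = d1.iloc[:k]
    return np.asarray(d1_k), d1_k.index

# smallest_k({'a': 0.9, 'b': 0.1, 'c': 0.5}, k=2) -> (array([0.1, 0.5]), Index(['b', 'c']))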
def Main():
    client = github_helpers.authenticate()
    keywords = raw_input("Please, enter keywords to search repositories: ")
    if keywords == '':
        keywords = 'javascript'
        print 'No keywords provided. It will use the keyword: ' + keywords
    search = client.search_repositories(keywords)
    first_page = search.get_page(0)

    languages = Series(r.language for r in first_page)
    languages = languages.dropna()
    languages.sort()
    percentages = (100.0 * languages.value_counts() / len(languages)).map(
        '{:,.2f} %'.format)
    print 'Languages percentage:'
    print percentages

    # Create plot
    x = [int(r.stargazers_count) for r in first_page]
    y = [int(r.forks) for r in first_page]
    # Add one to every value for logarithmic scale
    x = [val + 1 for val in x]
    y = [val + 1 for val in y]
    area = [100 for r in first_page]
    names = [r.name for r in first_page]
    colors = np.random.rand(len(first_page))
    pl.scatter(x, y, s=area, c=colors, alpha=0.5)
    for i in range(0, len(x)):
        pl.annotate(names[i], (x[i], y[i]), fontsize=2)
    pl.title("All values are with addition of 1 (for the logarithmic scale)")
    pl.xlabel("Stars")
    pl.xscale("log")
    pl.yscale("log")
    pl.ylabel("Forks")
    pl.tight_layout()

    filepath = 'reports/APIs/github'
    if not os.path.isdir(filepath):
        os.makedirs(filepath)
    filepath += '/search_repositories.png'
    pl.savefig(filepath, figsize=(1020, 1020), dpi=300)
    pl.close()
    print('A chart with high resolution and small font size (to minimize '
          'overlaps) was created at ' + filepath)
def test_ix_align(self):
    from pandas import Series
    b = Series(np.random.randn(10))
    b.sort()
    df_orig = Panel(np.random.randn(3, 10, 2))

    df = df_orig.copy()
    df.ix[0, :, 0] = b
    assert_series_equal(df.ix[0, :, 0].reindex(b.index), b)

    df = df_orig.swapaxes(0, 1)
    df.ix[:, 0, 0] = b
    assert_series_equal(df.ix[:, 0, 0].reindex(b.index), b)

    df = df_orig.swapaxes(1, 2)
    df.ix[0, 0, :] = b
    assert_series_equal(df.ix[0, 0, :].reindex(b.index), b)
def main():
    import shutil
    import tempfile
    import warnings

    from pandas import Series
    from vbench.api import BenchmarkRunner
    from suite import (REPO_PATH, BUILD, DB_PATH, PREPARE,
                       dependencies, benchmarks)
    from memory_profiler import memory_usage

    warnings.filterwarnings('ignore', category=FutureWarning)

    try:
        TMP_DIR = tempfile.mkdtemp()
        runner = BenchmarkRunner(
            benchmarks, REPO_PATH, REPO_PATH, BUILD, DB_PATH,
            TMP_DIR, PREPARE, always_clean=True,
            # run_option='eod',
            start_date=START_DATE,
            module_dependencies=dependencies)

        results = {}
        for b in runner.benchmarks:
            k = b.name
            try:
                vs = memory_usage((b.run,))
                v = max(vs)
                # print(k, v)
                results[k] = v
            except Exception as e:
                print("Exception caught in %s\n" % k)
                print(str(e))

        s = Series(results)
        s.sort()
        print(s)
    finally:
        shutil.rmtree(TMP_DIR)
def get_same_cluster_articles(self, user_id, label, topn=3):
    log_collection = self.conn.get_collection('article_read_log')
    ls = log_collection.group(
        {'article_id': True},
        {'label': str(label), 'user_id': {'$ne': user_id}},
        {'count': 0},
        'function(obj, prev) {prev.count++}')
    ls_conv = {'article_id': [], 'count': []}
    for item in ls:
        ls_conv['article_id'].append(item['article_id'])
        ls_conv['count'].append(item['count'])
    s = Series(index=ls_conv['article_id'], data=ls_conv['count'])
    s.sort(ascending=False)  # sorting
    return s.keys()[:topn]
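Collection.group() has since been removed from newer MongoDB/pymongo versions, and Series.sort() from pandas. A rough sketch of the same "most-read articles in this cluster, excluding the requesting user" lookup with an aggregation pipeline and sort_values, reusing the collection and field names from the snippet above:

from pandas import Series

def get_same_cluster_articles(conn, user_id, label, topn=3):
    coll = conn.get_collection('article_read_log')
    # count read-log entries per article for this cluster, excluding the user
    pipeline = [
        {'$match': {'label': str(label), 'user_id': {'$ne': user_id}}},
        {'$group': {'_id': '$article_id', 'count': {'$sum': 1}}},
    ]
    counts = {doc['_id']: doc['count'] for doc in coll.aggregate(pipeline)}
    s = Series(counts).sort_values(ascending=False)
    return list(s.index[:topn])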
def testSeries(self):
    s = Series([10, 20, 30, 40, 50, 60], name="series",
               index=[6, 7, 8, 9, 10, 15])
    s.sort()

    # column indexed
    outp = Series(ujson.decode(ujson.encode(s)))
    outp.sort()
    self.assertTrue((s == outp).values.all())

    outp = Series(ujson.decode(ujson.encode(s), numpy=True))
    outp.sort()
    self.assertTrue((s == outp).values.all())

    dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split")))
    outp = Series(**dec)
    self.assertTrue((s == outp).values.all())
    self.assertTrue(s.name == outp.name)

    dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"),
                                   numpy=True))
    outp = Series(**dec)
    self.assertTrue((s == outp).values.all())
    self.assertTrue(s.name == outp.name)

    outp = Series(ujson.decode(ujson.encode(s, orient="records"), numpy=True))
    self.assertTrue((s == outp).values.all())

    outp = Series(ujson.decode(ujson.encode(s, orient="records")))
    self.assertTrue((s == outp).values.all())

    outp = Series(ujson.decode(ujson.encode(s, orient="values"), numpy=True))
    self.assertTrue((s == outp).values.all())

    outp = Series(ujson.decode(ujson.encode(s, orient="values")))
    self.assertTrue((s == outp).values.all())

    outp = Series(ujson.decode(ujson.encode(s, orient="index")))
    outp.sort()
    self.assertTrue((s == outp).values.all())

    outp = Series(ujson.decode(ujson.encode(s, orient="index"), numpy=True))
    outp.sort()
    self.assertTrue((s == outp).values.all())
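For reference, the JSON shapes exercised above, as inferred from how each branch decodes them (a reconstruction from the assertions, not verified encoder output):

# default / "index":    {"6": 10, "7": 20, ...}  -- label-to-value mapping, so the
#                       decoded Series is re-sorted before comparison
# "split":              {"name": "series", "index": [...], "data": [...]}  -- fed
#                       straight back through Series(**dec)
# "records" / "values": [10, 20, 30, 40, 50, 60]  -- values only, compared directly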
def visualize_tree(tree, feature_names, labelnames, filename):
    """Create a tree PDF using graphviz.

    Args
    ----
    tree -- scikit-learn DecisionTree.
    feature_names -- list of feature names.
    """
    labels = Series(labelnames.values.ravel()).unique()
    labels.sort()
    labels = map(str, labels)
    # labels = labelnames.unique()
    # print labels
    with open(filename + ".dot", 'w') as f:
        export_graphviz(tree.dt, out_file=f, feature_names=feature_names,
                        class_names=labels)

    command = ["dot", "-Tpdf", filename + ".dot", "-o", filename + ".pdf"]
    try:
        subprocess.check_call(command)
    except:
        exit("Could not run dot, ie graphviz, to produce visualization")
nonalphabet = re.compile("[^a-z]")

with open('words.txt', 'r') as f:
    lines = f.readlines()

trigrams = {}
for line in lines:
    trigram = line.strip().lower()[0:3]
    if len(trigram) >= 3 and not nonalphabet.search(trigram):
        if trigram == "aaa":
            print "line: {0} trigram: {1}".format(line, trigram)
        trigrams.setdefault(trigram, 0)
        trigrams[trigram] += 1

trigram_series = Series(trigrams.values(), index=trigrams.keys())
trigram_series.sort(inplace=True, ascending=True)
print trigram_series
print "quartiles:\n{0}".format(
    trigram_series.quantile([.25, .50, .75, .99]).to_string())
print "median is: {0}".format(trigram_series.median())

unique_trigrams = []
for trigram, count in trigrams.iteritems():
    if count > trigram_series.quantile(.50):
        unique_trigrams.append(trigram)

print "saving trigrams"
with open("trigrams.json", "w") as f:
    json.dump(unique_trigrams, f)
print "saved {0} trigrams".format(len(unique_trigrams))
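The counting loop above is a plain frequency table; an equivalent sketch with collections.Counter and the current pandas API, under the same assumptions (words.txt exists, one word per line):

import json
import re
from collections import Counter

from pandas import Series

nonalphabet = re.compile("[^a-z]")

with open('words.txt') as f:
    # first three letters of each word, keeping only purely alphabetic trigrams
    counts = Counter(
        line.strip().lower()[:3] for line in f
        if len(line.strip()) >= 3
        and not nonalphabet.search(line.strip().lower()[:3])
    )

trigram_series = Series(counts).sort_values()
median = trigram_series.median()
unique_trigrams = [t for t, c in counts.items() if c > median]

with open("trigrams.json", "w") as f:
    json.dump(unique_trigrams, f)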
import matplotlib.pyplot as pl

ind = np.arange(len(egs))
fig = pl.figure(1, figsize=(9, 4))
ax = fig.add_subplot(111)
ax.bar(ind, egs)
pl.xticks(ind, ixs, rotation=90)
ax.set_title('Entropy Gain')

# Using a tree for feature importance
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)
z_tmp = Series(clf.feature_importances_, X_all_aug.columns)
z_tmp.sort()
ixs = z_tmp.index

ind = np.arange(len(e_gain))
pl.figure(figsize=(9, 4))
pl.bar(ind, e_gain)
pl.xticks(ind, ixs, rotation=90)

df = DataFrame({'e_gain': e_gain, 'import': z_tmp})
df = df.sort('e_gain', ascending=False)
ind = np.arange(df.shape[0])
pl.figure(figsize=(9, 4))
pl.bar(ind, df.values[:, 0], df.values[:, 1])
pl.xticks(ind, df.index, rotation=90)
# Not great plot
all_games = pd.read_csv('Data/Box_Scores/' + year + '/Game_List_' + year + '.csv',
                        dtype={'Away_PTS': np.object, 'Home_PTS': np.object})
all_games['Date'] = pd.to_datetime(all_games['Date'])
DateAsOf = max(all_games['Date'].loc[(all_games['Home_PTS'] != ' ')])
print("Date As Of: " + str(DateAsOf.date()))
all_games['Home_PTS'] = pd.to_numeric(all_games['Home_PTS'], errors='coerce')
all_games['Away_PTS'] = pd.to_numeric(all_games['Away_PTS'], errors='coerce')
all_games = all_games.loc[all_games['Away_PTS'].isnull() == False]
all_games = all_games.rename(columns={'Home': 'HOME', 'Home_PTS': 'HOME_PTS',
                                      'Away': 'AWAY', 'Away_PTS': 'AWAY_PTS'})

teams = list(all_games['HOME'].value_counts().keys())
teams.sort()
dates = Series(all_games['Date'].value_counts().keys())
dates = list(dates[dates > start_date])
dates.sort()


def calc_sos(team_df, MOV_df, BaseRate):
    teams = list(MOV_df['Team'])
    SOS = [0 for team in teams]
    init_rate = MOV_df[BaseRate]
    MOV_df = MOV_df.rename(columns={'Team': 'VS_Team', BaseRate: 'VS_' + BaseRate})
    for (i, team) in enumerate(teams):
        team_mov = pd.merge(team_df[i], MOV_df[['VS_Team', 'VS_' + BaseRate]],
                            how='left', on='VS_Team')
        SOS[i] = np.mean(team_mov['VS_' + BaseRate])
    MOV_df = MOV_df.rename(columns={'VS_Team': 'Team', 'VS_' + BaseRate: BaseRate})
    MOV_df['SOS'] = SOS - np.mean(SOS)
    MOV_df['SRS'] = MOV_df['MOV'] + MOV_df['SOS']
    delta = np.mean(np.absolute(MOV_df['SRS'] - init_rate))
print(s['a'], end='')
print(' which is same as ', end='')
print(s.a)

# A lot of NumPy functions also accept a Series as an argument without problems

# Some useful functions
# Series.median(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)
# axis is either 0 or 1. axis = 0 means 'column-wise' and axis = 1 means 'row-wise'.
# As of now, we know that a Series is a 1D array, so only axis = 0 makes sense.
# If you put axis = 1, you get an error, as it doesn't make sense for a 1D array.
# Check for yourself.
# skipna = skip NA/null values. If everything in the series is NA, return NA.
# level = don't know yet
# numeric_only = currently not implemented
s.sort()
print(s)
print(s.median(axis=0))  # median: sort first, then take the middle element.
                         # For an even number of entries, average the two middle items.
# print(s.median(axis=1))

# Comparing a whole Series with a scalar
l = (s > s.median())  # compares every item with the scalar and returns a Series of bools
print(l)
print(s[l])  # prints only the items where the corresponding boolean is True.
             # So elegant. Remember it. :-)
def get_hist_data(alist, limit=10):
    """Make a frequency Series from a list using pandas, limited to the `limit` most frequent."""
    genus_freq = Series(alist).value_counts()
    genus_freq.sort(ascending=False)  # biggest to smallest
    return genus_freq[:limit]
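Since value_counts() already returns counts sorted from most to least frequent, the explicit sort is redundant; a minimal sketch of the same helper against the current pandas API, keeping the name from the snippet above:

from pandas import Series

def get_hist_data(alist, limit=10):
    # value_counts() sorts descending by default, so just take the head
    return Series(alist).value_counts().head(limit)

# get_hist_data(['oak', 'pine', 'oak', 'birch', 'oak', 'pine'], limit=2)
# -> oak 3, pine 2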
import re

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

s1 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-25.csv')
s2 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-26.csv')
s3 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-27.csv')
s4 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-28.csv')
all_tweets = pd.concat([s1, s2, s3, s4])

twitter_handle_re = re.compile(r'@([A-Za-z0-9_]+)')

mention_counts = Series()
for item in all_tweets:
    mentions = twitter_handle_re.findall(item)
    for mention in mentions:
        if mention in mention_counts.keys():
            mention_counts[mention] += 1
        else:
            mention_counts[mention] = 1

mention_counts.sort(ascending=False)
# print mention_counts
mention_counts.plot()
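Growing a Series one label at a time copies the underlying data on every new entry; a sketch of the same mention tally built with collections.Counter and converted to a Series once at the end (the regex and the all_tweets variable are reused from the snippet above):

import re
from collections import Counter

from pandas import Series

twitter_handle_re = re.compile(r'@([A-Za-z0-9_]+)')

mention_counts = Counter()
for tweet in all_tweets:
    mention_counts.update(twitter_handle_re.findall(tweet))

# one conversion at the end, sorted most-mentioned first
mention_series = Series(mention_counts).sort_values(ascending=False)
mention_series.plot()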
def summarycalc(projid, date=None, state=0):
    proj = web.ctx.orm.query(Project).filter_by(projid=projid).first()
    if proj is None:
        return None
    symbolsin = proj.symbol
    causesin = proj.cause
    summary_dict = {}
    for symbol in symbolsin:
        symbolid = symbol.symbolid
        resultid = str(projid) + '_' + str(symbolid)
        allresult = []
        totalresult = list(web.ctx.orm.query(Result).filter_by(resultid=resultid).all())
        if totalresult == []:
            return None
        if state == 0:
            date = totalresult[0].origintime.strftime('%Y-%m-%d')
            # datestr = totalresult[0].origintime.strftime('%Y-%m-%d')
            # date = time.strptime(datestr, '%Y-%m-%d')
            # date = datetime.datetime(*totalresult[0].origintime[:6])
        # allresult = [item for item in totalresult
        #              if func.date_format(item.origintime, '%Y-%m-%d') == date]
        allresult = [item for item in totalresult
                     if item.origintime.strftime('%Y-%m-%d') == date]
        if allresult == []:
            return None
        symbolresultdict = {}
        cppredict_dict = {}
        ncppredict_dict = {}
        for cause in causesin:
            causeid = cause.causeid
            if cause.causename in ['cause4', 'cause5', 'cause8', 'cause9']:
                lcausevalue = json.loads(cause.causevalue)
                xdayslist = lcausevalue[cause.causename]
                for xday in xdayslist:
                    cpname = 'cp for ' + xday + ' ' + gcausedict[cause.causename]
                    ncpname = 'ncp for ' + xday + ' ' + gcausedict[cause.causename]
                    predicter_cp = web.ctx.orm.query(Predict).filter_by(
                        causeid=causeid, resultname=cpname).first()
                    predicter_ncp = web.ctx.orm.query(Predict).filter_by(
                        causeid=causeid, resultname=ncpname).first()
                    cppredict_dict[cpname] = predicter_cp.coefficient
                    ncppredict_dict[ncpname] = predicter_ncp.coefficient
            else:
                cpname = 'cp for ' + gcausedict[cause.causename]
                ncpname = 'ncp for ' + gcausedict[cause.causename]
                predicter_cp = web.ctx.orm.query(Predict).filter_by(
                    causeid=causeid, resultname=cpname).first()
                predicter_ncp = web.ctx.orm.query(Predict).filter_by(
                    causeid=causeid, resultname=ncpname).first()
                cppredict_dict[cpname] = predicter_cp.coefficient
                ncppredict_dict[ncpname] = predicter_ncp.coefficient
        cppredict_ser = Series(cppredict_dict, index=cppredict_dict.keys())
        ncppredict_ser = Series(ncppredict_dict, index=ncppredict_dict.keys())
        cppredict_ser.sort()
        ncppredict_ser.sort()
        if len(cppredict_ser) < 3:
            realcp_ser = cppredict_ser
        else:
            realcp_ser = cppredict_ser[-3:]
        if len(ncppredict_ser) < 3:
            realncp_ser = ncppredict_ser
        else:
            realncp_ser = ncppredict_ser[-3:]
        symbol_summary_dict = {}
        for idx in realcp_ser.index:
            symbol_summary_dict[idx] = [process(item.resultvalue)
                                        for item in allresult
                                        if item.resultname == idx][0]
        for idx in realncp_ser.index:
            symbol_summary_dict[idx] = [process(item.resultvalue)
                                        for item in allresult
                                        if item.resultname == idx][0]
        summary_dict[symbol] = symbol_summary_dict
    summary_df = DataFrame(summary_dict)
    return summary_df.T
def interrogator( corpus, search, query="any", show="w", exclude=False, excludemode="any", searchmode="all", dep_type="collapsed-ccprocessed-dependencies", case_sensitive=False, quicksave=False, just_speakers=False, preserve_case=False, lemmatag=False, files_as_subcorpora=False, conc=False, only_unique=False, random=False, only_format_match=False, multiprocess=False, spelling=False, regex_nonword_filter=r"[A-Za-z0-9:_]", gramsize=2, split_contractions=False, **kwargs ): """interrogate corpus, corpora, subcorpus and file objects see corpkit.interrogation.interrogate() for docstring""" # store kwargs locs = locals() from corpkit.interrogation import Interrogation from corpkit.process import tregex_engine import pandas as pd from pandas import DataFrame, Series from collections import Counter from corpkit.other import as_regex from corpkit.process import get_deps from time import localtime, strftime thetime = strftime("%H:%M:%S", localtime()) from corpkit.textprogressbar import TextProgressBar from corpkit.process import animator from corpkit.dictionaries.word_transforms import wordlist, taglemma # find out if using gui root = kwargs.get("root") note = kwargs.get("note") # convert path to corpus object if type(corpus) == str: from corpkit.corpus import Corpus corpus = Corpus(corpus) # figure out how the user has entered the query and normalise from corpkit.process import searchfixer search, search_iterable = searchfixer(search, query) # for better printing of query, esp during multiprocess # can remove if multiprocess printing improved if len(search.keys()) == 1: query = search.values()[0] if "l" in show and search.get("t"): from nltk.stem.wordnet import WordNetLemmatizer lmtzr = WordNetLemmatizer() if type(show) == str: show = [show] def is_multiquery(corpus, search, query, just_speakers): """determine if multiprocessing is needed do some retyping if need be as well""" im = False from collections import OrderedDict if hasattr(corpus, "__iter__"): im = True # so we can do search = 't', query = ['NP', 'VP']: if type(query) == list: if query != search.values()[0] or len(search.keys()) > 1: query = {c.title(): c for c in query} if type(query) == dict or type(query) == OrderedDict: im = True if just_speakers: if just_speakers == "each": im = True just_speakers = ["each"] if just_speakers == ["each"]: im = True if type(just_speakers) == str: im = False just_speakers = [just_speakers] if type(just_speakers) == list: if len(just_speakers) > 1: im = True if type(search) == dict: if all(type(i) == dict for i in search.values()): im = True return im, corpus, search, query, just_speakers def slow_tregex(sents, **dummy_args): """do the speaker-specific version of tregex queries""" import os from corpkit.process import tregex_engine # first, put the relevant trees into temp file if kwargs.get("outname"): to_open = "tmp-%s.txt" % kwargs["outname"] else: to_open = "tmp.txt" to_write = "\n".join([sent._parse_string.strip() for sent in sents if sent.parse_string is not None]) to_write.encode("utf-8", errors="ignore") with open(to_open, "w") as fo: fo.write(to_write) q = search.values()[0] res = tregex_engine( query=q, options=["-o", "-%s" % translated_option], corpus=to_open, root=root, preserve_case=True ) if root: root.update() os.remove(to_open) if countmode: return len(res) else: return res def get_stats(sents, **dummy_args): """get a bunch of frequencies on interpersonal phenomena""" import os import re from collections import Counter statsmode_results = Counter() # first, put the relevant trees into 
temp file if kwargs.get("outname"): to_open = "tmp-%s.txt" % kwargs["outname"] else: to_open = "tmp.txt" with open(to_open, "w") as fo: for sent in sents: statsmode_results["Sentences"] += 1 sts = sent.parse_string.rstrip() encd = sts.encode("utf-8", errors="ignore") + "\n" fo.write(encd) deps = get_deps(sent, dep_type) numpass = len([x for x in deps.links if x.type.endswith("pass")]) statsmode_results["Passives"] += numpass statsmode_results["Tokens"] += len(sent.tokens) words = [w.word for w in sent.tokens if w.word.isalnum()] statsmode_results["Words"] += len(words) statsmode_results["Characters"] += len("".join(words)) # count moods via trees (/\?/ !< __) from dictionaries.process_types import processes from corpkit.other import as_regex tregex_qs = { "Imperative": r"ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/", "Open interrogative": r"ROOT < SBARQ <<- (/\?/ !< __)", "Closed interrogative": r"ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))", "Unmodalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))", "Modalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))", "Open class words": r"/^(NN|JJ|VB|RB)/ < __", "Closed class words": r"__ !< __ !> /^(NN|JJ|VB|RB)/", "Clauses": r"/^S/ < __", "Interrogative": r"ROOT << (/\?/ !< __)", "Mental processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.mental, boundaries="w"), "Verbal processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.verbal, boundaries="w"), "Relational processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.relational, boundaries="w"), } for name, q in sorted(tregex_qs.items()): res = tregex_engine(query=q, options=["-o", "-C"], corpus=to_open, root=root) statsmode_results[name] += int(res) global numdone numdone += 1 if root: root.update() else: tot_string = str(numdone + 1) + "/" + str(total_files) if kwargs.get("outname"): tot_string = "%s: %s" % (kwargs["outname"], tot_string) animator(p, numdone, tot_string, **par_args) if kwargs.get("note", False): kwargs["note"].progvar.set((numdone * 100.0 / total_files / denom) + startnum) os.remove(to_open) return statsmode_results def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr=False): if speakr is False: speakr = "" conc_lines = [] # remove duplicates from results unique_wholes = [] unique_middle_column_result = [] duplicates = [] for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)): if "-join-".join([f, whole, mid]) not in duplicates: duplicates.append("-join-".join([f, whole, mid])) unique_wholes.append([f, whole]) unique_middle_column_result.append(mid) # split into start, middle and end, dealing with multiple occurrences for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)): reg = re.compile(r"([^a-zA-Z0-9-]|^)(" + re.escape(mid) + r")([^a-zA-Z0-9-]|$)", re.IGNORECASE | re.UNICODE) offsets = [(m.start(), m.end()) for m in re.finditer(reg, whole)] for offstart, offend in offsets: start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip() conc_lines.append([os.path.basename(f), speakr, start, middle, end]) return conc_lines def uniquify(conc_lines): from collections import OrderedDict unique_lines = [] checking = [] for index, (f, speakr, start, middle, end) in enumerate(conc_lines): joined = " 
".join([speakr, start, "MIDDLEHERE:", middle, ":MIDDLEHERE", end]) if joined not in checking: unique_lines.append(conc_lines[index]) checking.append(joined) return unique_lines def lemmatiser(list_of_words, tag): """take a list of unicode words and a tag and return a lemmatised list.""" output = [] for word in list_of_words: if translated_option.startswith("u"): if word.lower() in taglemma.keys(): word = taglemma[word.lower()] else: if word == "x": word = "Other" # only use wordnet lemmatiser when appropriate else: if word in wordlist: word = wordlist[word] word = lmtzr.lemmatize(word, tag) output.append(word) return output def gettag(query, lemmatag=False): """ Find tag for WordNet lemmatisation """ import re tagdict = {"N": "n", "A": "a", "V": "v", "A": "r", "None": False, "": False, "Off": False} if lemmatag is False: tag = "n" # same default as wordnet # attempt to find tag from tregex query tagfinder = re.compile(r"^[^A-Za-z]*([A-Za-z]*)") tagchecker = re.compile(r"^[A-Z]{1,4}$") qr = query.replace(r"\w", "").replace(r"\s", "").replace(r"\b", "") treebank_tag = re.findall(tagfinder, qr) if re.match(tagchecker, treebank_tag[0]): tag = tagdict.get(treebank_tag[0], "n") elif lemmatag: tag = lemmatag return tag def format_tregex(results): """format tregex by show list""" if countmode: return results import re done = [] if "l" in show or "pl" in show: lemmata = lemmatiser(results, gettag(search.get("t"), lemmatag)) else: lemmata = [None for i in results] for word, lemma in zip(results, lemmata): bits = [] if exclude and exclude.get("w"): if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("w"), word): continue if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("l"), lemma): continue if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("p"), word): continue if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("pl"), lemma): continue if exclude and excludemode == "all": num_to_cause_exclude = len(exclude.keys()) current_num = 0 if exclude.get("w"): if re.search(exclude.get("w"), word): current_num += 1 if exclude.get("l"): if re.search(exclude.get("l"), lemma): current_num += 1 if exclude.get("p"): if re.search(exclude.get("p"), word): current_num += 1 if exclude.get("pl"): if re.search(exclude.get("pl"), lemma): current_num += 1 if current_num == num_to_cause_exclude: continue for i in show: if i == "t": bits.append(word) if i == "l": bits.append(lemma) elif i == "w": bits.append(word) elif i == "p": bits.append(word) elif i == "pl": bits.append(lemma) joined = "/".join(bits) done.append(joined) return done def tok_by_list(pattern, list_of_toks, concordancing=False, **kwargs): """search for regex in plaintext corpora""" import re if type(pattern) == str: pattern = [pattern] if not case_sensitive: pattern = [p.lower() for p in pattern] if not concordancing: if case_sensitive: matches = [m for m in list_of_toks if m in pattern] else: matches = [m for m in list_of_toks if m.lower() in pattern] else: matches = [] for index, token in enumerate(list_of_toks): if token in pattern: match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(token) match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140]) matches.append(match) if countmode: return len(matches) else: return matches def unsplitter(lst): """unsplit contractions and apostophes from tokenised text""" if split_contractions: return lst unsplit = [] for index, t in enumerate(lst): if index 
== 0 or index == len(lst) - 1: unsplit.append(t) continue if "'" in t and not t.endswith("'"): rejoined = "".join([lst[index - 1], t]) unsplit.append(rejoined) else: if not "'" in lst[index + 1]: unsplit.append(t) return unsplit def tok_ngrams(pattern, list_of_toks, concordancing=False, split_contractions=True): from collections import Counter import re ngrams = Counter() result = [] # if it's not a compiled regex list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)] if pattern.lower() == "any": pattern = r".*" if not split_contractions: list_of_toks = unsplitter(list_of_toks) # list_of_toks = [x for x in list_of_toks if "'" not in x] for index, w in enumerate(list_of_toks): try: the_gram = [list_of_toks[index + x] for x in range(gramsize)] if not any(re.search(pattern, x) for x in the_gram): continue ngrams[" ".join(the_gram)] += 1 except IndexError: pass # turn counter into list of results for k, v in ngrams.items(): if v > 1: for i in range(v): result.append(k) if countmode: return len(result) else: return result def compiler(pattern): """compile regex or fail gracefully""" import re try: if case_sensitive: comped = re.compile(pattern) else: comped = re.compile(pattern, re.IGNORECASE) return comped except: import traceback import sys from time import localtime, strftime exc_type, exc_value, exc_traceback = sys.exc_info() lst = traceback.format_exception(exc_type, exc_value, exc_traceback) error_message = lst[-1] thetime = strftime("%H:%M:%S", localtime()) print "%s: Query %s" % (thetime, error_message) if root: return "Bad query" else: raise ValueError("%s: Query %s" % (thetime, error_message)) def tok_by_reg(pattern, list_of_toks, concordancing=False, **kwargs): """search for regex in plaintext corpora""" import re comped = compiler(pattern) if comped == "Bad query": return "Bad query" if not concordancing: matches = [m for m in list_of_toks if re.search(comped, m)] else: matches = [] for index, token in enumerate(list_of_toks): if re.search(comped, token): match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(re.search(comped, token).group(0)) match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140]) matches.append(match) if countmode: return len(matches) else: return matches def plaintext_regex_search(pattern, plaintext_data, concordancing=False, **kwargs): """search for regex in plaintext corpora it searches over lines, so the user needs to be careful. 
""" import re if concordancing: pattern = r"(.{,140})\b(" + pattern + r")\b(.{,140})" compiled_pattern = compiler(pattern) if compiled_pattern == "Bad query": return "Bad query" matches = re.findall(compiled_pattern, plaintext_data) if concordancing: matches = [list(m) for m in matches] if not concordancing: for index, i in enumerate(matches): if type(i) == tuple: matches[index] = i[0] if countmode: return len(matches) else: return matches def correct_spelling(a_string): if not spelling: return a_string from dictionaries.word_transforms import usa_convert if spelling.lower() == "uk": usa_convert = {v: k for k, v in usa_convert.items()} spell_out = [] bits = a_string.split("/") for index, i in enumerate(bits): converted = usa_convert.get(i.lower(), i) if i.islower() or preserve_case is False: converted = converted.lower() elif i.isupper() and preserve_case: converted = converted.upper() elif i.istitle() and preserve_case: converted = converted.title() bits[index] = converted r = "/".join(bits) return r def plaintext_simple_search(pattern, plaintext_data, concordancing=False, **kwargs): """search for tokens in plaintext corpora""" import re result = [] if type(pattern) == str: pattern = [pattern] for p in pattern: if concordancing: pat = r"(.{0,140})\b(" + re.escape(p) + r")\b(.{0,140})" pat = compiler(pat) if pat == "Bad query": return "Bad query" matches = re.findall(pat, plaintext_data) if concordancing: matches = [list(m) for m in matches] for i in matches: result.append(i) else: for m in range(len(matches)): result.append(p) return result # do multiprocessing if need be im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers) locs["search"] = search locs["query"] = query locs["just_speakers"] = just_speakers locs["corpus"] = corpus locs["multiprocess"] = multiprocess if im: from corpkit.multiprocess import pmultiquery return pmultiquery(**locs) datatype = corpus.datatype singlefile = corpus.singlefile # store all results in here results = {} # check if just counting countmode = "c" in show # where we are at in interrogation current_iter = 0 # multiprocessing progress bar denom = kwargs.get("denominator", 1) startnum = kwargs.get("startnum", 0) ############################################ # Determine the search function to be used # ############################################ # simple tregex is tregex over whole dirs simple_tregex_mode = False statsmode = False if not just_speakers and "t" in search.keys(): simple_tregex_mode = True else: if corpus.datatype == "plaintext": if search.get("n"): raise NotImplementedError("Use a tokenised corpus for n-gramming.") # searcher = plaintext_ngram optiontext = "n-grams via plaintext" if search.get("w"): if kwargs.get("regex", True): searcher = plaintext_regex_search else: searcher = plaintext_simple_search optiontext = "Searching plaintext" elif corpus.datatype == "tokens": if search.get("n"): searcher = tok_ngrams optiontext = "n-grams via tokens" elif search.get("w"): if kwargs.get("regex", True): searcher = tok_by_reg else: searcher = tok_by_list if type(search.get("w")) == list: searcher = tok_by_list optiontext = "Searching tokens" only_parse = ["r", "d", "g", "dl", "gl", "df", "gf", "dp", "gp", "f"] if corpus.datatype != "parse" and any(i in only_parse for i in search.keys()): raise ValueError( 'Need parsed corpus to search with "%s" option(s).' 
% ", ".join([i for i in search.keys() if i in only_parse]) ) elif corpus.datatype == "parse": if search.get("t"): searcher = slow_tregex elif search.get("s"): searcher = get_stats statsmode = True optiontext = "General statistics" global numdone numdone = 0 else: from corpkit.depsearch import dep_searcher searcher = dep_searcher optiontext = "Dependency querying" ############################################ # Set some Tregex-related values # ############################################ if search.get("t"): query = search.get("t") # check the query q = tregex_engine(corpus=False, query=search.get("t"), options=["-t"], check_query=True, root=root) if query is False: if root: return "Bad query" else: return optiontext = "Searching parse trees" if "p" in show or "pl" in show: translated_option = "u" if type(search["t"]) == list: search["t"] = r"__ < (/%s/ !< __)" % as_regex( search["t"], boundaries="line", case_sensitive=case_sensitive ) if search["t"] == "any": search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)" elif "t" in show: translated_option = "o" if type(search["t"]) == list: search["t"] = r"__ < (/%s/ !< __)" % as_regex( search["t"], boundaries="line", case_sensitive=case_sensitive ) if search["t"] == "any": search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)" elif "w" in show: translated_option = "t" if type(search["t"]) == list: search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive) if search["t"] == "any": search["t"] = r"/.?[A-Za-z0-9].?/ !< __" elif "c" in show: count_results = {} only_count = True translated_option = "C" if type(search["t"]) == list: search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive) if search["t"] == "any": search["t"] = r"/.?[A-Za-z0-9].?/ !< __" elif "l" in show: translated_option = "t" if type(search["t"]) == list: search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive) if search["t"] == "any": search["t"] = r"/.?[A-Za-z0-9].?/ !< __" query = search["t"] ############################################ # Make iterable for corpus/subcorpus/file # ############################################ if corpus.singlefile: to_iterate_over = {(corpus.name, corpus.path): [corpus]} elif not corpus.subcorpora: to_iterate_over = {(corpus.name, corpus.path): corpus.files} else: to_iterate_over = {} for k, v in sorted(corpus.structure.items()): to_iterate_over[(k.name, k.path)] = v if files_as_subcorpora: to_iterate_over = {} for f in corpus.files: to_iterate_over[(f.name, f.path)] = [f] ############################################ # Print welcome message # ############################################ if conc: message = "Concordancing" else: message = "Interrogating" if kwargs.get("printstatus", True): thetime = strftime("%H:%M:%S", localtime()) sformat = "\n ".join(["%s: %s" % (k.rjust(3), v) for k, v in search.items()]) if search == {"s": r".*"}: sformat = "features" welcome = "\n%s: %s %s ...\n %s\n Query: %s\n" % ( thetime, message, corpus.name, optiontext, sformat, ) print welcome ############################################ # Make progress bar # ############################################ if simple_tregex_mode: total_files = len(to_iterate_over.keys()) else: if search.get("s"): total_files = sum([len(x) for x in to_iterate_over.values()]) * 12 else: total_files = sum([len(x) for x in to_iterate_over.values()]) par_args = {"printstatus": kwargs.get("printstatus", True), "root": root, "note": note, "length": total_files} term = None if 
kwargs.get("paralleling", None) is not None: from blessings import Terminal term = Terminal() par_args["terminal"] = term par_args["linenum"] = kwargs.get("paralleling") outn = kwargs.get("outname", "") if outn: outn = outn + ": " tstr = "%s%d/%d" % (outn, current_iter, total_files) p = animator(None, None, init=True, tot_string=tstr, **par_args) tstr = "%s%d/%d" % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) ############################################ # Iterate over data, doing interrogations # ############################################ for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()): if countmode or conc: results[subcorpus_name] = [] else: results[subcorpus_name] = Counter() # tregex over subcorpora, not files if simple_tregex_mode: op = ["-o", "-" + translated_option] result = tregex_engine( query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case ) if countmode: results[subcorpus_name].append(result) continue result = Counter(format_tregex(result)) if conc: op.append("-w") whole_result = tregex_engine( query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case ) if not only_format_match: whole_result = format_tregex(whole_result) result = make_conc_lines_from_whole_mid(whole_result, result, speakr=False) if spelling: for index, line in enumerate(result): result[index] = [correct_spelling(b) for b in line] results[subcorpus_name] += result current_iter += 1 if kwargs.get("paralleling", None) is not None: tstr = "%s%d/%d" % (outn, current_iter + 2, total_files) else: tstr = "%s%d/%d" % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # dependencies, plaintext, tokens or slow_tregex else: for f in files: if corpus.datatype == "parse": with open(f.path, "r") as data: data = data.read() from corenlp_xml.document import Document try: corenlp_xml = Document(data) except: print "Could not read file: %s" % f.path continue if just_speakers: sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers] if not sents: continue else: sents = corenlp_xml.sentences res = searcher( sents, search=search, show=show, dep_type=dep_type, exclude=exclude, excludemode=excludemode, searchmode=searchmode, lemmatise=False, case_sensitive=case_sensitive, concordancing=conc, only_format_match=only_format_match, ) if res == "Bad query": return "Bad query" if searcher == slow_tregex and not countmode: res = format_tregex(res) elif corpus.datatype == "tokens": import pickle with open(f.path, "rb") as fo: data = pickle.load(fo) res = searcher(search.values()[0], data, split_contractions=split_contractions, concordancing=conc) if conc: for index, line in enumerate(res): line.insert(0, "") elif corpus.datatype == "plaintext": with open(f.path, "rb") as data: data = data.read() data = unicode(data, errors="ignore") res = searcher(search.values()[0], data, concordancing=conc) if conc: for index, line in enumerate(res): line.insert(0, "") if countmode: results[subcorpus_name] += res continue # add filename and do lowercasing for conc if conc: for index, line in enumerate(res): line.insert(0, f.name) if not preserve_case: line = [b.lower() for b in line] if spelling: line = [correct_spelling(b) for b in line] results[subcorpus_name] += [line] # do lowercasing and spelling else: if not preserve_case: res = [r.lower() for r in res] if spelling: res = [correct_spelling(r) for r in res] results[subcorpus_name] += Counter(res) if not 
statsmode: current_iter += 1 if kwargs.get("paralleling", None) is not None: tstr = "%s%d/%d" % (outn, current_iter + 2, total_files) else: tstr = "%s%d/%d" % (outn, current_iter + 1, total_files) # delete temp file if there import os if os.path.isfile("tmp.txt"): os.remove("tmp.txt") ############################################ # Get concordances into DataFrame # ############################################ if conc: all_conc_lines = [] for sc_name, resu in sorted(results.items()): if only_unique: unique_results = uniquify(resu) else: unique_results = resu # make into series pindex = "c f s l m r".encode("utf-8").split() for fname, spkr, start, word, end in unique_results: spkr = unicode(spkr, errors="ignore") fname = os.path.basename(fname) # the use of ascii here makes sure the string formats ok, but will also screw over # anyone doing non-english work. so, change to utf-8, then fix errors as they come # in the corpkit-gui "add_conc_lines_to_window" function all_conc_lines.append( Series( [ sc_name.encode("ascii", errors="ignore"), fname.encode("ascii", errors="ignore"), spkr.encode("ascii", errors="ignore"), start.encode("ascii", errors="ignore"), word.encode("ascii", errors="ignore"), end.encode("ascii", errors="ignore"), ], index=pindex, ) ) # randomise results... if random: from random import shuffle shuffle(all_conc_lines) df = pd.concat(all_conc_lines, axis=1).T # not doing anything yet --- this is for multimodal concordancing add_links = False if not add_links: df.columns = ["c", "f", "s", "l", "m", "r"] else: df.columns = ["c", "f", "s", "l", "m", "r", "link"] if all(x == "" for x in list(df["s"].values)): df.drop("s", axis=1, inplace=True) if kwargs.get("note"): kwargs["note"].progvar.set(100) if kwargs.get("printstatus", True): thetime = strftime("%H:%M:%S", localtime()) finalstring = "\n\n%s: Concordancing finished! %d matches.\n" % (thetime, len(df.index)) print finalstring from corpkit.interrogation import Concordance output = Concordance(df) output.query = locs if quicksave: interro.save() return output ############################################ # Get interrogation into DataFrame # ############################################ else: if countmode: df = Series({k: sum(v) for k, v in sorted(results.items())}) tot = df.sum() else: the_big_dict = {} unique_results = set([item for sublist in results.values() for item in sublist]) for word in unique_results: the_big_dict[word] = [subcorp_result[word] for subcorp_result in sorted(results.values())] # turn master dict into dataframe, sorted df = DataFrame(the_big_dict, index=sorted(results.keys())) numentries = len(df.columns) tot = df.sum(axis=1) total_total = df.sum().sum() ############################################ # Format, output as Interrogation object # ############################################ if not countmode: if not corpus.subcorpora or singlefile: if not files_as_subcorpora: if not kwargs.get("df1_always_df"): df = Series(df.ix[0]) df.sort(ascending=False) tot = df.sum() numentries = len(df.index) total_total = tot # sort by total if type(df) == pd.core.frame.DataFrame: if not df.empty: df.ix["Total-tmp"] = df.sum() the_tot = df.ix["Total-tmp"] df = df[the_tot.argsort()[::-1]] df = df.drop("Total-tmp", axis=0) # format final string if kwargs.get("printstatus", True): thetime = strftime("%H:%M:%S", localtime()) finalstring = "\n\n%s: Interrogation finished!" % thetime if countmode: finalstring += " %d matches." % tot else: finalstring += " %d unique results, %d total occurrences." 
% (numentries, total_total) print finalstring interro = Interrogation(results=df, totals=tot, query=locs) if quicksave: interro.save() return interro
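For orientation, a hypothetical call based only on the signature and return value visible in this snippet (the corpus path and Tregex pattern are made up; the function converts a string path to a Corpus object itself):

# search parse trees for noun phrases, showing matched words
result = interrogator('data/my-corpus', search={'t': r'NP'}, show=['w'])
print(result.results)   # per-subcorpus counts, as built from the_big_dict above
print(result.totals)    # totals passed to Interrogation(results=df, totals=tot, ...)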
def interrogator(corpus, search, query = 'any', show = 'w', exclude = False, excludemode = 'any', searchmode = 'all', dep_type = 'collapsed-ccprocessed-dependencies', case_sensitive = False, quicksave = False, just_speakers = False, preserve_case = False, lemmatag = False, files_as_subcorpora = False, conc = False, only_unique = False, random = False, only_format_match = False, multiprocess = False, spelling = False, regex_nonword_filter = r'[A-Za-z0-9:_]', gramsize = 2, split_contractions = False, **kwargs): """interrogate corpus, corpora, subcorpus and file objects see corpkit.interrogation.interrogate() for docstring""" # store kwargs locs = locals() from interrogation import Interrogation from process import tregex_engine import pandas as pd from pandas import DataFrame, Series from collections import Counter from other import as_regex from process import get_deps from time import localtime, strftime thetime = strftime("%H:%M:%S", localtime()) from textprogressbar import TextProgressBar from process import animator from dictionaries.word_transforms import wordlist, taglemma import corenlp_xml import codecs # find out if using gui root = kwargs.get('root') note = kwargs.get('note') # convert path to corpus object if type(corpus) == str: from corpus import Corpus corpus = Corpus(corpus) # figure out how the user has entered the query and normalise from process import searchfixer search, search_iterable = searchfixer(search, query) # for better printing of query, esp during multiprocess # can remove if multiprocess printing improved if len(list(search.keys())) == 1: query = list(search.values())[0] if 'l' in show and search.get('t'): from nltk.stem.wordnet import WordNetLemmatizer lmtzr=WordNetLemmatizer() if type(show) == str: show = [show] def is_multiquery(corpus, search, query, just_speakers): """determine if multiprocessing is needed do some retyping if need be as well""" im = False from collections import OrderedDict if hasattr(corpus, '__iter__'): im = True # so we can do search = 't', query = ['NP', 'VP']: if type(query) == list: if query != list(search.values())[0] or len(list(search.keys())) > 1: query = {c.title(): c for c in query} if type(query) == dict or type(query) == OrderedDict: im = True if just_speakers: if just_speakers == 'each': im = True just_speakers = ['each'] if just_speakers == ['each']: im = True if type(just_speakers) == str: im = False just_speakers = [just_speakers] if type(just_speakers) == list: if len(just_speakers) > 1: im = True if type(search) == dict: if all(type(i) == dict for i in list(search.values())): im = True return im, corpus, search, query, just_speakers def slow_tregex(sents, **dummy_args): """do the speaker-specific version of tregex queries""" import os from process import tregex_engine # first, put the relevant trees into temp file if kwargs.get('outname'): to_open = 'tmp-%s.txt' % kwargs['outname'] else: to_open = 'tmp.txt' to_write = '\n'.join([sent._parse_string.strip() for sent in sents \ if sent.parse_string is not None]) to_write.encode('utf-8', errors = 'ignore') with open(to_open, "w") as fo: fo.write(to_write) q = list(search.values())[0] res = tregex_engine(query = q, options = ['-o', '-%s' % translated_option], corpus = to_open, root = root, preserve_case = True) if root: root.update() os.remove(to_open) if countmode: return(len(res)) else: return res def get_stats(sents, **dummy_args): """get a bunch of frequencies on interpersonal phenomena""" import os import re from collections import Counter statsmode_results = Counter() 
# first, put the relevant trees into temp file if kwargs.get('outname'): to_open = 'tmp-%s.txt' % kwargs['outname'] else: to_open = 'tmp.txt' with open(to_open, "w") as fo: for sent in sents: statsmode_results['Sentences'] += 1 sts = sent.parse_string.rstrip() encd = sts.encode('utf-8', errors = 'ignore') + '\n' fo.write(encd) deps = get_deps(sent, dep_type) numpass = len([x for x in deps.links if x.type.endswith('pass')]) statsmode_results['Passives'] += numpass statsmode_results['Tokens'] += len(sent.tokens) words = [w.word for w in sent.tokens if w.word.isalnum()] statsmode_results['Words'] += len(words) statsmode_results['Characters'] += len(''.join(words)) # count moods via trees (/\?/ !< __) from dictionaries.process_types import processes from other import as_regex tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/', 'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))', 'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))', 'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))', 'Open class words': r'/^(NN|JJ|VB|RB)/ < __', 'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/', 'Clauses': r'/^S/ < __', 'Interrogative': r'ROOT << (/\?/ !< __)', 'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'), 'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'), 'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w') } for name, q in sorted(tregex_qs.items()): res = tregex_engine(query = q, options = ['-o', '-C'], corpus = to_open, root = root) statsmode_results[name] += int(res) global numdone numdone += 1 if root: root.update() else: tot_string = str(numdone + 1) + '/' + str(total_files) if kwargs.get('outname'): tot_string = '%s: %s' % (kwargs['outname'], tot_string) animator(p, numdone, tot_string, **par_args) if kwargs.get('note', False): kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum) os.remove(to_open) return statsmode_results def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr = False): if speakr is False: speakr = '' conc_lines = [] # remove duplicates from results unique_wholes = [] unique_middle_column_result = [] duplicates = [] for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)): if '-join-'.join([f, whole, mid]) not in duplicates: duplicates.append('-join-'.join([f, whole, mid])) unique_wholes.append([f, whole]) unique_middle_column_result.append(mid) # split into start, middle and end, dealing with multiple occurrences for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)): reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)', re.IGNORECASE | re.UNICODE) offsets = [(m.start(), m.end()) for m in re.finditer(reg,whole)] for offstart, offend in offsets: start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip() conc_lines.append([os.path.basename(f), speakr, start, middle, end]) return conc_lines def uniquify(conc_lines): from collections import OrderedDict unique_lines = [] checking = [] for index, (f, speakr, start, middle, 
end) in enumerate(conc_lines): joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end]) if joined not in checking: unique_lines.append(conc_lines[index]) checking.append(joined) return unique_lines def lemmatiser(list_of_words, tag): """take a list of unicode words and a tag and return a lemmatised list.""" output = [] for word in list_of_words: if translated_option.startswith('u'): if word.lower() in list(taglemma.keys()): word = taglemma[word.lower()] else: if word == 'x': word = 'Other' # only use wordnet lemmatiser when appropriate else: if word in wordlist: word = wordlist[word] word = lmtzr.lemmatize(word, tag) output.append(word) return output def gettag(query, lemmatag = False): """ Find tag for WordNet lemmatisation """ import re tagdict = {'N': 'n', 'A': 'a', 'V': 'v', 'A': 'r', 'None': False, '': False, 'Off': False} if lemmatag is False: tag = 'n' # same default as wordnet # attempt to find tag from tregex query tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)') tagchecker = re.compile(r'^[A-Z]{1,4}$') qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '') treebank_tag = re.findall(tagfinder, qr) if re.match(tagchecker, treebank_tag[0]): tag = tagdict.get(treebank_tag[0], 'n') elif lemmatag: tag = lemmatag return tag def format_tregex(results): """format tregex by show list""" if countmode: return results import re done = [] if 'l' in show or 'pl' in show: lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag)) else: lemmata = [None for i in results] for word, lemma in zip(results, lemmata): bits = [] if exclude and exclude.get('w'): if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('w'), word): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('l'), lemma): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('p'), word): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('pl'), lemma): continue if exclude and excludemode == 'all': num_to_cause_exclude = len(list(exclude.keys())) current_num = 0 if exclude.get('w'): if re.search(exclude.get('w'), word): current_num += 1 if exclude.get('l'): if re.search(exclude.get('l'), lemma): current_num += 1 if exclude.get('p'): if re.search(exclude.get('p'), word): current_num += 1 if exclude.get('pl'): if re.search(exclude.get('pl'), lemma): current_num += 1 if current_num == num_to_cause_exclude: continue for i in show: if i == 't': bits.append(word) if i == 'l': bits.append(lemma) elif i == 'w': bits.append(word) elif i == 'p': bits.append(word) elif i == 'pl': bits.append(lemma) joined = '/'.join(bits) done.append(joined) return done def tok_by_list(pattern, list_of_toks, concordancing = False, **kwargs): """search for regex in plaintext corpora""" import re if type(pattern) == str: pattern = [pattern] if not case_sensitive: pattern = [p.lower() for p in pattern] if not concordancing: if case_sensitive: matches = [m for m in list_of_toks if m in pattern] else: matches = [m for m in list_of_toks if m.lower() in pattern] else: matches = [] for index, token in enumerate(list_of_toks): if token in pattern: match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(token) match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140]) matches.append(match) if countmode: return(len(matches)) else: return matches def unsplitter(lst): """unsplit contractions and apostophes from tokenised text""" if 
split_contractions: return lst unsplit = [] for index, t in enumerate(lst): if index == 0 or index == len(lst) - 1: unsplit.append(t) continue if "'" in t and not t.endswith("'"): rejoined = ''.join([lst[index - 1], t]) unsplit.append(rejoined) else: if not "'" in lst[index + 1]: unsplit.append(t) return unsplit def tok_ngrams(pattern, list_of_toks, concordancing = False, split_contractions = True): from collections import Counter import re ngrams = Counter() result = [] # if it's not a compiled regex list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)] if pattern.lower() == 'any': pattern = r'.*' if not split_contractions: list_of_toks = unsplitter(list_of_toks) #list_of_toks = [x for x in list_of_toks if "'" not in x] for index, w in enumerate(list_of_toks): try: the_gram = [list_of_toks[index+x] for x in range(gramsize)] if not any(re.search(pattern, x) for x in the_gram): continue ngrams[' '.join(the_gram)] += 1 except IndexError: pass # turn counter into list of results for k, v in list(ngrams.items()): if v > 1: for i in range(v): result.append(k) if countmode: return(len(result)) else: return result def compiler(pattern): """compile regex or fail gracefully""" import re try: if case_sensitive: comped = re.compile(pattern) else: comped = re.compile(pattern, re.IGNORECASE) return comped except: import traceback import sys from time import localtime, strftime exc_type, exc_value, exc_traceback = sys.exc_info() lst = traceback.format_exception(exc_type, exc_value, exc_traceback) error_message = lst[-1] thetime = strftime("%H:%M:%S", localtime()) print('%s: Query %s' % (thetime, error_message)) if root: return 'Bad query' else: raise ValueError('%s: Query %s' % (thetime, error_message)) def tok_by_reg(pattern, list_of_toks, concordancing = False, **kwargs): """search for regex in plaintext corpora""" import re comped = compiler(pattern) if comped == 'Bad query': return 'Bad query' if not concordancing: matches = [m for m in list_of_toks if re.search(comped, m)] else: matches = [] for index, token in enumerate(list_of_toks): if re.search(comped, token): match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(re.search(comped, token).group(0)) match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140]) matches.append(match) if countmode: return(len(matches)) else: return matches def plaintext_regex_search(pattern, plaintext_data, concordancing = False, **kwargs): """search for regex in plaintext corpora it searches over lines, so the user needs to be careful. 
""" import re if concordancing: pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})' compiled_pattern = compiler(pattern) if compiled_pattern == 'Bad query': return 'Bad query' matches = re.findall(compiled_pattern, plaintext_data) if concordancing: matches = [list(m) for m in matches] if not concordancing: for index, i in enumerate(matches): if type(i) == tuple: matches[index] = i[0] if countmode: return(len(matches)) else: return matches def correct_spelling(a_string): if not spelling: return a_string from dictionaries.word_transforms import usa_convert if spelling.lower() == 'uk': usa_convert = {v: k for k, v in list(usa_convert.items())} spell_out = [] bits = a_string.split('/') for index, i in enumerate(bits): converted = usa_convert.get(i.lower(), i) if i.islower() or preserve_case is False: converted = converted.lower() elif i.isupper() and preserve_case: converted = converted.upper() elif i.istitle() and preserve_case: converted = converted.title() bits[index] = converted r = '/'.join(bits) return r def plaintext_simple_search(pattern, plaintext_data, concordancing = False, **kwargs): """search for tokens in plaintext corpora""" import re result = [] if type(pattern) == str: pattern = [pattern] for p in pattern: if concordancing: pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})' pat = compiler(pat) if pat == 'Bad query': return 'Bad query' matches = re.findall(pat, plaintext_data) if concordancing: matches = [list(m) for m in matches] for i in matches: result.append(i) else: for m in range(len(matches)): result.append(p) return result # do multiprocessing if need be im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers) locs['search'] = search locs['query'] = query locs['just_speakers'] = just_speakers locs['corpus'] = corpus locs['multiprocess'] = multiprocess if im: from multiprocess import pmultiquery return pmultiquery(**locs) datatype = corpus.datatype singlefile = corpus.singlefile # store all results in here results = {} # check if just counting countmode = 'c' in show # where we are at in interrogation current_iter = 0 # multiprocessing progress bar denom = kwargs.get('denominator', 1) startnum = kwargs.get('startnum', 0) ############################################ # Determine the search function to be used # ############################################ # simple tregex is tregex over whole dirs simple_tregex_mode = False statsmode = False if not just_speakers and 't' in list(search.keys()): simple_tregex_mode = True else: if corpus.datatype == 'plaintext': if search.get('n'): raise NotImplementedError('Use a tokenised corpus for n-gramming.') #searcher = plaintext_ngram optiontext = 'n-grams via plaintext' if search.get('w'): if kwargs.get('regex', True): searcher = plaintext_regex_search else: searcher = plaintext_simple_search optiontext = 'Searching plaintext' elif corpus.datatype == 'tokens': if search.get('n'): searcher = tok_ngrams optiontext = 'n-grams via tokens' elif search.get('w'): if kwargs.get('regex', True): searcher = tok_by_reg else: searcher = tok_by_list if type(search.get('w')) == list: searcher = tok_by_list optiontext = 'Searching tokens' only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f'] if corpus.datatype != 'parse' and any(i in only_parse for i in list(search.keys())): raise ValueError('Need parsed corpus to search with "%s" option(s).' 
% ', '.join([i for i in list(search.keys()) if i in only_parse])) elif corpus.datatype == 'parse': if search.get('t'): searcher = slow_tregex elif search.get('s'): searcher = get_stats statsmode = True optiontext = 'General statistics' global numdone numdone = 0 else: from depsearch import dep_searcher searcher = dep_searcher optiontext = 'Dependency querying' ############################################ # Set some Tregex-related values # ############################################ if search.get('t'): query = search.get('t') # check the query q = tregex_engine(corpus = False, query = search.get('t'), options = ['-t'], check_query = True, root = root) if query is False: if root: return 'Bad query' else: return optiontext = 'Searching parse trees' if 'p' in show or 'pl' in show: translated_option = 'u' if type(search['t']) == list: search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)' elif 't' in show: translated_option = 'o' if type(search['t']) == list: search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)' elif 'w' in show: translated_option = 't' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' elif 'c' in show: count_results = {} only_count = True translated_option = 'C' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' elif 'l' in show: translated_option = 't' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' query = search['t'] ############################################ # Make iterable for corpus/subcorpus/file # ############################################ if corpus.singlefile: to_iterate_over = {(corpus.name, corpus.path): [corpus]} elif not corpus.subcorpora: to_iterate_over = {(corpus.name, corpus.path): corpus.files} else: to_iterate_over = {} for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name): to_iterate_over[(k.name, k.path)] = v if files_as_subcorpora: to_iterate_over = {} for f in corpus.files: to_iterate_over[(f.name, f.path)] = [f] ############################################ # Print welcome message # ############################################ if conc: message = 'Concordancing' else: message = 'Interrogating' if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) sformat = '\n '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())]) if search == {'s': r'.*'}: sformat = 'features' welcome = '\n%s: %s %s ...\n %s\n Query: %s\n' % \ (thetime, message, corpus.name, optiontext, sformat) print(welcome) ############################################ # Make progress bar # ############################################ if simple_tregex_mode: total_files = len(list(to_iterate_over.keys())) else: if search.get('s'): total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12 else: total_files = sum([len(x) for x in list(to_iterate_over.values())]) par_args = {'printstatus': kwargs.get('printstatus', 
True), 'root': root, 'note': note, 'length': total_files} term = None if kwargs.get('paralleling', None) is not None: from blessings import Terminal term = Terminal() par_args['terminal'] = term par_args['linenum'] = kwargs.get('paralleling') outn = kwargs.get('outname', '') if outn: outn = outn + ': ' tstr = '%s%d/%d' % (outn, current_iter, total_files) p = animator(None, None, init = True, tot_string = tstr, **par_args) tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) ############################################ # Iterate over data, doing interrogations # ############################################ for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()): if countmode or conc: results[subcorpus_name] = [] else: results[subcorpus_name] = Counter() # tregex over subcorpora, not files if simple_tregex_mode: op = ['-o', '-' + translated_option] result = tregex_engine(query = search['t'], options = op, corpus = subcorpus_path, root = root, preserve_case = preserve_case) current_iter += 1 if not countmode: result = Counter(format_tregex(result)) if conc: op.append('-w') whole_result = tregex_engine(query = search['t'], options = op, corpus = subcorpus_path, root = root, preserve_case = preserve_case) if not only_format_match: whole_result = format_tregex(whole_result) result = make_conc_lines_from_whole_mid(whole_result, result, speakr = False) if spelling: for index, line in enumerate(result): result[index] = [correct_spelling(b) for b in line] if countmode: results[subcorpus_name].append(result) else: results[subcorpus_name] += result if kwargs.get('paralleling', None) is not None: tstr = '%s%d/%d' % (outn, current_iter + 2, total_files) else: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # dependencies, plaintext, tokens or slow_tregex else: for f in files: if corpus.datatype == 'parse': with open(f.path, 'r') as data: data = data.read() from corenlp_xml.document import Document try: corenlp_xml = Document(data) except: print('Could not read file: %s' % f.path) continue if just_speakers: sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers] if not sents: continue else: sents = corenlp_xml.sentences res = searcher(sents, search = search, show = show, dep_type = dep_type, exclude = exclude, excludemode = excludemode, searchmode = searchmode, lemmatise = False, case_sensitive = case_sensitive, concordancing = conc, only_format_match = only_format_match) if res == 'Bad query': return 'Bad query' if searcher == slow_tregex and not countmode: res = format_tregex(res) elif corpus.datatype == 'tokens': import pickle with codecs.open(f.path, "rb") as fo: data = pickle.load(fo) res = searcher(list(search.values())[0], data, split_contractions = split_contractions, concordancing = conc) if conc: for index, line in enumerate(res): line.insert(0, '') elif corpus.datatype == 'plaintext': with codecs.open(f.path, 'rb', encoding = 'utf-8') as data: data = data.read() res = searcher(list(search.values())[0], data, concordancing = conc) if conc: for index, line in enumerate(res): line.insert(0, '') if countmode: results[subcorpus_name] += res continue # add filename and do lowercasing for conc if conc: for index, line in enumerate(res): line.insert(0, f.name) if not preserve_case: line = [b.lower() for b in line] if spelling: line = [correct_spelling(b) for b in line] results[subcorpus_name] += [line] # do lowercasing and spelling else: if not preserve_case: 
res = [r.lower() for r in res] if spelling: res = [correct_spelling(r) for r in res] results[subcorpus_name] += Counter(res) if not statsmode: current_iter += 1 if kwargs.get('paralleling', None) is not None: tstr = '%s%d/%d' % (outn, current_iter + 2, total_files) else: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # delete temp file if there import os if os.path.isfile('tmp.txt'): os.remove('tmp.txt') ############################################ # Get concordances into DataFrame # ############################################ if conc: all_conc_lines = [] for sc_name, resu in sorted(results.items()): if only_unique: unique_results = uniquify(resu) else: unique_results = resu #make into series pindex = 'c f s l m r'.encode('utf-8').split() for fname, spkr, start, word, end in unique_results: #spkr = str(spkr, errors = 'ignore') fname = os.path.basename(fname) # the use of ascii here makes sure the string formats ok, but will also screw over # anyone doing non-english work. so, change to utf-8, then fix errors as they come # in the corpkit-gui "add_conc_lines_to_window" function all_conc_lines.append(Series([sc_name, fname, \ spkr, \ start, \ word, \ end], \ index = pindex)) # randomise results... if random: from random import shuffle shuffle(all_conc_lines) df = pd.concat(all_conc_lines, axis = 1).T # not doing anything yet --- this is for multimodal concordancing add_links = False if not add_links: df.columns = ['c', 'f', 's', 'l', 'm', 'r'] else: df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link'] if all(x == '' for x in list(df['s'].values)): df.drop('s', axis = 1, inplace = True) if kwargs.get('note'): kwargs['note'].progvar.set(100) if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) finalstring = '\n\n%s: Concordancing finished! %d matches.\n' % (thetime, len(df.index)) print(finalstring) from interrogation import Concordance output = Concordance(df) output.query = locs if quicksave: interro.save() return output ############################################ # Get interrogation into DataFrame # ############################################ else: if countmode: df = Series({k: sum(v) for k, v in sorted(results.items())}) tot = df.sum() else: the_big_dict = {} unique_results = set([item for sublist in list(results.values()) for item in sublist]) for word in unique_results: the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items(), key=lambda x: x[0])] # turn master dict into dataframe, sorted df = DataFrame(the_big_dict, index = sorted(results.keys())) numentries = len(df.columns) tot = df.sum(axis = 1) total_total = df.sum().sum() ############################################ # Format, output as Interrogation object # ############################################ if not countmode: if not corpus.subcorpora or singlefile: if not files_as_subcorpora: if not kwargs.get('df1_always_df'): df = Series(df.ix[0]) df.sort(ascending = False) tot = df.sum() numentries = len(df.index) total_total = tot # sort by total if type(df) == pd.core.frame.DataFrame: if not df.empty: df.ix['Total-tmp'] = df.sum() the_tot = df.ix['Total-tmp'] df = df[the_tot.argsort()[::-1]] df = df.drop('Total-tmp', axis = 0) # format final string if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) finalstring = '\n\n%s: Interrogation finished!' % thetime if countmode: finalstring += ' %d matches.' % tot else: finalstring += ' %d unique results, %d total occurrences.' 
% (numentries, total_total) print(finalstring) interro = Interrogation(results = df, totals = tot, query = locs) if quicksave: interro.save() return interro
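The interrogation routine above ends by ordering the result columns by their totals via the old in-place Series.sort()/argsort idiom. A minimal sketch of the same "order columns by descending total" step, assuming a plain numeric DataFrame (the names and data here are illustrative, not part of the original function):

import numpy as np
import pandas as pd

# Sketch only: order the columns of a DataFrame by their column totals, largest first.
def sort_columns_by_total(df):
    totals = df.sum(axis=0)                              # one total per column
    ordered = totals.sort_values(ascending=False).index
    return df[ordered]

example = pd.DataFrame(np.arange(12).reshape(3, 4), columns=list('abcd'))
print(sort_columns_by_total(example))                    # column 'd' (largest total) comes first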
def test_value_counts_inferred(self): klasses = [Index, Series] for klass in klasses: s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.unique(s_values)) self.assertEqual(s.nunique(), 4) # don't sort, have to sort after the fact as not sorting is platform-dep hist = s.value_counts(sort=False) hist.sort() expected = Series([3, 1, 4, 2], index=list("acbd")) expected.sort() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) expected = Series([1, 2, 3, 4], index=list("cdab")) tm.assert_series_equal(hist, expected) # relative histogram. hist = s.value_counts(normalize=True) expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) tm.assert_series_equal(hist, expected) # bins self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({0.998: 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3])) self.assertEqual(s1.nunique(), 3) res4 = s1.value_counts(bins=4) exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] s = klass(s_values) expected = Series([4, 3, 2], index=["b", "a", "d"]) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.array(["a", "b", np.nan, "d"], dtype="O")) self.assertEqual(s.nunique(), 3) s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.array([])) self.assertEqual(s.nunique(), 0) # GH 3002, datetime64[ns] txt = "\n".join( [ "xxyyzz20100101PIE", "xxyyzz20100101GUM", "xxyyzz20100101EGG", "xxyyww20090101EGG", "foofoo20080909PIE", "foofoo20080909GUM", ] ) f = StringIO(txt) df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) s = klass(df["dt"].copy(), name="dt") idx = pd.to_datetime(["2010-01-01 00:00:00Z", "2008-09-09 00:00:00Z", "2009-01-01 00:00:00X"]) expected_s = Series([3, 2, 1], index=idx, name="dt") tm.assert_series_equal(s.value_counts(), expected_s) expected = np.array( ["2010-01-01 00:00:00Z", "2009-01-01 00:00:00Z", "2008-09-09 00:00:00Z"], dtype="datetime64[ns]" ) if isinstance(s, DatetimeIndex): expected = DatetimeIndex(expected) self.assertTrue(s.unique().equals(expected)) else: self.assert_numpy_array_equal(s.unique(), expected) self.assertEqual(s.nunique(), 3) # with NaT s = df["dt"].copy() s = klass([v for v in s.values] + [pd.NaT], name="dt") result = s.value_counts() self.assertEqual(result.index.dtype, "datetime64[ns]") tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) unique = s.unique() self.assertEqual(unique.dtype, "datetime64[ns]") # numpy_array_equal cannot compare pd.NaT self.assert_numpy_array_equal(unique[:3], expected) self.assertTrue(unique[3] is pd.NaT or 
unique[3].astype("int64") == pd.tslib.iNaT) self.assertEqual(s.nunique(), 3) self.assertEqual(s.nunique(dropna=False), 4) # timedelta64[ns] td = df.dt - df.dt + timedelta(1) td = klass(td, name="dt") result = td.value_counts() expected_s = Series([6], index=[Timedelta("1day")], name="dt") tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(["1 days"]) if isinstance(td, TimedeltaIndex): self.assertTrue(td.unique().equals(expected)) else: self.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2, name="dt") result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s)
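The test above re-sorts both the observed and expected histograms before comparing, because value_counts(sort=False) leaves the order platform-dependent. A small illustration of that step using the current sort_values() in place of the removed in-place Series.sort() (the data is the same toy example as in the test):

import pandas as pd

# Sketch only: compare two frequency counts independently of their original order.
s = pd.Series(["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"])
hist = s.value_counts(sort=False)                         # order not guaranteed
expected = pd.Series([3, 1, 4, 2], index=list("acbd"))
print(hist.sort_values().equals(expected.sort_values()))  # True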
ser1 ser1.sort_index() ser1 ser1.order() from numpy.random import randn ser2 = Series(randn(10)) ser2 ser2.rank() ser2 ser2.sort() ser2 ser2.rank() #------------------ # lec022 #------------------ arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]]) arr dframe1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three']) dframe1 # aggregate by column dframe1.sum()
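The lecture snippet above contrasts sorting a Series with ranking it. A self-contained version with current method names (sort_values rather than the removed in-place Series.sort); the data is made up:

import numpy as np
import pandas as pd

ser = pd.Series([3.0, np.nan, 1.0, 2.0])
print(ser.sort_values())   # returns a sorted copy; NaN is pushed to the end
print(ser.rank())          # rank of each value in its original position; NaN stays NaN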
def plot_lowlevel(plot_spec: pd.Series, ax: 'matplotlib.pyplot.Axes', conditions: pd.Series, ms: pd.DataFrame, plot_sim: bool) -> 'matplotlib.pyplot.Axes': """ Plotting routine / preparations: set properties of figure and plot the data with given specifications (lineplot with errorbars, or barplot) Parameters: plot_spec: contains defined data format (visualization file) ax: axes to which to plot conditions: Values on x-axis ms: contains measurement data which should be plotted plot_sim: tells whether or not simulated data should be plotted as well Returns: Updated axis object. """ warnings.warn("This function will be removed in future releases. ", DeprecationWarning) # set yScale if plot_spec[Y_SCALE] == LIN: ax.set_yscale("linear") elif plot_spec[Y_SCALE] == LOG10: ax.set_yscale("log") elif plot_spec[Y_SCALE] == LOG: ax.set_yscale("log", base=np.e) # add yOffset ms.loc[:, 'mean'] = ms['mean'] + plot_spec[Y_OFFSET] ms.loc[:, 'repl'] = ms['repl'] + plot_spec[Y_OFFSET] if plot_sim: ms.loc[:, 'sim'] = ms['sim'] + plot_spec[Y_OFFSET] # set type of noise if plot_spec[PLOT_TYPE_DATA] == MEAN_AND_SD: noise_col = 'sd' elif plot_spec[PLOT_TYPE_DATA] == MEAN_AND_SEM: noise_col = 'sem' elif plot_spec[PLOT_TYPE_DATA] == PROVIDED: noise_col = 'noise_model' if plot_spec.plotTypeSimulation == LINE_PLOT: # set xScale if plot_spec[X_SCALE] == LIN: ax.set_xscale("linear") elif plot_spec[X_SCALE] == LOG10: ax.set_xscale("log") elif plot_spec[X_SCALE] == LOG: ax.set_xscale("log", base=np.e) # equidistant elif plot_spec[X_SCALE] == 'order': ax.set_xscale("linear") # check if conditions are monotone decreasing or increasing if np.all(np.diff(conditions) < 0): # monot. decreasing xlabel = conditions[::-1] # reversing conditions = range(len(conditions))[::-1] # reversing ax.set_xticks(range(len(conditions)), xlabel) elif np.all(np.diff(conditions) > 0): xlabel = conditions conditions = range(len(conditions)) ax.set_xticks(range(len(conditions)), xlabel) else: raise ValueError('Error: x-conditions do not coincide, ' 'some are mon. 
increasing, some monotonically' ' decreasing') # add xOffset conditions = conditions + plot_spec[X_OFFSET] # plotting all measurement data label_base = plot_spec[LEGEND_ENTRY] if plot_spec[PLOT_TYPE_DATA] == REPLICATE: p = ax.plot( conditions[conditions.index.values], ms.repl[ms.repl.index.values], 'x', label=label_base ) # construct errorbar-plots: noise specified above else: # sort index for the case that indices of conditions and # measurements differ if indep_var='time', conditions is a numpy # array, for indep_var=observable its a Series if isinstance(conditions, np.ndarray): conditions.sort() elif isinstance(conditions, pd.core.series.Series): conditions.sort_index(inplace=True) else: raise ValueError('Strange: conditions object is neither numpy' ' nor series...') ms.sort_index(inplace=True) # sorts according to ascending order of conditions scond, smean, snoise = \ zip(*sorted(zip(conditions, ms['mean'], ms[noise_col]))) p = ax.errorbar( scond, smean, snoise, linestyle='-.', marker='.', label=label_base ) # construct simulation plot colors = p[0].get_color() if plot_sim: xs, ys = zip(*sorted(zip(conditions, ms['sim']))) ax.plot( xs, ys, linestyle='-', marker='o', label=label_base + " simulation", color=colors ) # construct bar plot elif plot_spec[PLOT_TYPE_SIMULATION] == BAR_PLOT: x_name = plot_spec[LEGEND_ENTRY] if plot_sim: bar_kwargs = { 'align': 'edge', 'width': -1/3, } else: bar_kwargs = { 'align': 'center', 'width': 2/3, } p = ax.bar(x_name, ms['mean'], yerr=ms[noise_col], color=sns.color_palette()[0], **bar_kwargs) if plot_sim: colors = p[0].get_facecolor() bar_kwargs['width'] = -bar_kwargs['width'] ax.bar(x_name, ms['sim'], color='white', edgecolor=colors, **bar_kwargs) # construct scatter plot elif plot_spec[PLOT_TYPE_SIMULATION] == SCATTER_PLOT: if not plot_sim: raise NotImplementedError('Scatter plots do not work without' ' simulation data') ax.scatter(ms['mean'], ms['sim'], label=plot_spec[LEGEND_ENTRY]) ax = square_plot_equal_ranges(ax) # show 'e' as basis not 2.7... in natural log scale cases def ticks(y, _): return r'$e^{{{:.0f}}}$'.format(np.log(y)) if plot_spec[X_SCALE] == LOG: ax.xaxis.set_major_formatter(mtick.FuncFormatter(ticks)) if plot_spec[Y_SCALE] == LOG: ax.yaxis.set_major_formatter(mtick.FuncFormatter(ticks)) # set further plotting/layout settings if not plot_spec[PLOT_TYPE_SIMULATION] == BAR_PLOT: ax.legend() ax.set_title(plot_spec[PLOT_NAME]) ax.relim() ax.autoscale_view() return ax
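The line-plot branch above sorts the x-conditions differently depending on their type: ndarray.sort() works in place, while a pandas Series is re-ordered with sort_index(). A minimal sketch of that alignment step with made-up data (all names and values are illustrative):

import numpy as np
import pandas as pd

# Sketch only: sort the x-values, then align the measurements to the same order.
conditions = np.array([2.0, 0.5, 1.0])
means = pd.Series([20.0, 5.0, 10.0], index=[2.0, 0.5, 1.0])

if isinstance(conditions, np.ndarray):
    conditions.sort()                  # in-place ascending sort
means = means.sort_index()             # measurements are indexed by condition, same order
print(list(zip(conditions, means)))    # pairs of (x, mean) in ascending x order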
ser1.sort_index() # In[3]: ser1.order() # In[4]: from numpy.random import randn ser2 = Series(randn(10)) ser2 # In[6]: ser2.sort(inplace=True) # In[7]: ser2.rank() # In[8]: ser3 = Series(randn(10)) ser3 # In[9]: ser3.rank() # In[ ]:
meta['month'] = meta['date'].apply(lambda x: x.replace(day=1)) # Get captions for images. topics_dict = {} with open(doc_topic_words_filepath) as f: for line in f: data = line.strip().split(',') topic = data[0] words = data[1:] topics_dict[topic] = words # Write out images. topics = topics_dict.keys() topics.sort() index = [] for year in range(1973, 1977): for month in range(1, 13): date = datetime.datetime.strptime("1/%s/%s" % (month, year), "%d/%m/%Y") index.append(date) important_classifications = ['CONFIDENTIAL', 'UNCLASSIFIED', 'LIMITED OFFICIAL USE', 'SECRET'] colors = {'CONFIDENTIAL' : 'm-', 'UNCLASSIFIED' : 'c-', 'LIMITED OFFICIAL USE' : 'y-',
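In the snippet above, topics_dict.keys() followed by .sort() only works on Python 2, where keys() returns a list; on Python 3 it is a view with no sort method. A version of the same step that works on both (the dictionary content is a stand-in):

# Sketch only: get the topic names in sorted order.
topics_dict = {'t2': ['b'], 't1': ['a']}
topics = sorted(topics_dict)    # sorted list of keys, works on Python 2 and 3
print(topics)                   # ['t1', 't2']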
def statistics(request): """ This function is called when the Statistics button is pressed by the user. It's purpose is to take the selected platforms as well as some statistical parameters and perform two statistical functions: a T-Test and an FDR analysis :param request: :return: a rendered HTML page. """ cutoff_type = request.GET.get('cutoff_type') cutoff_value = float(request.GET.get('cutoff_value')) display_values = request.session.get('display_values', {}) spps = request.GET.get('spps') spps = spps.split(',') combined_series = [] display_profile = None for spp in spps: _, study, display_profile, platform = spp.split('|') profile = display_profile.replace('_', '-') sample_ids = geo_data.get_sample_ids(study, profile, platform) control_sample_ids = [] diseased_sample_ids = [] for sample_id in sample_ids: sample_attributes = geo_data.get_sample_attributes(study, profile, platform, sample_id) if sample_attributes['control']: control_sample_ids.append(sample_id) else: diseased_sample_ids.append(sample_id) genes = geo_data.get_all_gene_symbols(study, profile, platform) no_of_genes = len(genes) control_exprs = zeros((no_of_genes, len(control_sample_ids))) diseased_exprs = zeros((no_of_genes, len(diseased_sample_ids))) for (g_index, gene) in enumerate(genes): gene_exprs = zeros(len(control_sample_ids)) for (s_index, sample_id) in enumerate(control_sample_ids): expr_value = geo_data.get_gene_expression_value(study, profile, platform, sample_id, gene) if expr_value == 'None': continue gene_exprs[s_index] = expr_value control_exprs[g_index] = gene_exprs gene_exprs = zeros(len(diseased_sample_ids)) for (s_index, sample_id) in enumerate(diseased_sample_ids): expr_value = geo_data.get_gene_expression_value(study, profile, platform, sample_id, gene) if expr_value == 'None': continue gene_exprs[s_index] = expr_value diseased_exprs[g_index] = gene_exprs control_df = DataFrame(control_exprs, index=genes, columns=control_sample_ids) diseased_df = DataFrame(diseased_exprs, index=genes, columns=diseased_sample_ids) # Perform the the t-test and create a pandas Series t_statistics, p_values = ttest_ind(control_df.T, diseased_df.T) p_values_series = Series(p_values, index=genes) # Perform the fdr analysis, create a pandas Series and sort the series reject_fdr, pval_fdr = fdr_correction(p_values_series, method='indep') fdr_values_series = Series(pval_fdr, index=genes) p_values_series.sort(ascending=True) combined_series = [] for i in range(len(p_values_series)): symbol = p_values_series.index[i] p_value = p_values_series[i] if cutoff_type == 'p-value' and p_value > cutoff_value: break fdr_value = fdr_values_series.get(symbol) if cutoff_type == 'fdr-value' and fdr_value > cutoff_value: break combined_series.append([symbol, p_value, fdr_value]) display_values[display_profile] = combined_series request.session['display_values'] = display_values response = render_to_string('statistics.html', {display_profile: combined_series}) return HttpResponse(response)
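The pandas-specific core of the view above is: sort the per-gene p-values ascending, then walk them until the cutoff is exceeded. A compact sketch with invented gene names and a 0.05 cutoff standing in for cutoff_value (none of these values come from the original data):

import pandas as pd

p_values = pd.Series({'GENE_A': 0.20, 'GENE_B': 0.001, 'GENE_C': 0.03})
p_sorted = p_values.sort_values(ascending=True)   # replaces the in-place .sort(ascending=True)

selected = []
for gene, p in p_sorted.items():
    if p > 0.05:                                   # cutoff_value stand-in
        break
    selected.append((gene, p))
print(selected)                                    # [('GENE_B', 0.001), ('GENE_C', 0.03)]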
def test_value_counts_inferred(self): klasses = [Index, Series] for klass in klasses: s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] s = klass(s_values) expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.unique(s_values)) self.assertEqual(s.nunique(), 4) # don't sort, have to sort after the fact as not sorting is platform-dep hist = s.value_counts(sort=False) hist.sort() expected = Series([3, 1, 4, 2], index=list('acbd')) expected.sort() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) expected = Series([1, 2, 3, 4], index=list('cdab')) tm.assert_series_equal(hist, expected) # relative histogram. hist = s.value_counts(normalize=True) expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(hist, expected) # bins self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({0.998: 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3])) self.assertEqual(s1.nunique(), 3) res4 = s1.value_counts(bins=4) exp4 = Series({ 0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1 }, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series({ 0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25 }, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = [ 'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b' ] s = klass(s_values) expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal( s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O')) self.assertEqual(s.nunique(), 3) s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.array([])) self.assertEqual(s.nunique(), 0) # GH 3002, datetime64[ns] txt = "\n".join([ 'xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG', 'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM' ]) f = StringIO(txt) df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) s = klass(df['dt'].copy()) idx = pd.to_datetime([ '2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X' ]) expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) expected = np.array([ '2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z' ], dtype='datetime64[ns]') if isinstance(s, DatetimeIndex): expected = DatetimeIndex(expected) self.assertTrue(s.unique().equals(expected)) else: self.assert_numpy_array_equal(s.unique(), expected) self.assertEqual(s.nunique(), 3) # with NaT s = df['dt'].copy() s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() self.assertEqual(result.index.dtype, 'datetime64[ns]') tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) unique = s.unique() self.assertEqual(unique.dtype, 'datetime64[ns]') # numpy_array_equal cannot compare pd.NaT self.assert_numpy_array_equal(unique[:3], expected) self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT) 
self.assertEqual(s.nunique(), 3) self.assertEqual(s.nunique(dropna=False), 4) # timedelta64[ns] td = df.dt - df.dt + timedelta(1) td = klass(td) result = td.value_counts() expected_s = Series([6], index=[Timedelta('1day')]) tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(['1 days']) if isinstance(td, TimedeltaIndex): self.assertTrue(td.unique().equals(expected)) else: self.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2) result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s)
### ### ### ### ############################################################### # go to http://pandas.pydata.org/pandas-docs/stable/cookbook.html for several examples df3 + df4 #adds dataframes df4.add(df3,fill_value=0) # does the same thing, and replaces NaNs with 0 ser3 = df3.ix[0] # forming a series from a dataframe. Here the first row is returned as a Series ser3.sort_index() # sorts according to index ser5 = ser4.order() # sorts according to value, but is NOT in place ser4.sort() ## in place sorting df1.sum() #sum columns df1.sum(axis = 1) # sum rows df1.min() # minimum values across columns df1.idxmin() #index of the minimum values df1.cumsum() # returns dataframe with cumulative sums across columns df1.describe() # returns summary stats across columns df.drop_duplicates() # drops duplicate rows
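A tiny runnable companion to the cookbook notes above, written with current method names (iloc for row selection, sort_values in place of the removed order/sort pair); the frame itself is made up:

import numpy as np
import pandas as pd

df1 = pd.DataFrame(np.arange(6).reshape(2, 3), index=['r1', 'r2'], columns=['a', 'b', 'c'])
ser3 = df1.iloc[0]            # first row as a Series
print(ser3.sort_index())      # sort by index label
print(ser3.sort_values())     # sort by value; returns a new Series, not in place
print(df1.sum())              # column sums
print(df1.sum(axis=1))        # row sums
print(df1.idxmin())           # index label of the minimum value in each column
print(df1.cumsum())           # cumulative sums down each column
print(df1.describe())         # summary statistics per column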
import numpy as np import pandas as pd from pandas import Series, DataFrame ser1 = Series(range(3), index=['C', 'A', 'B']) ser2 = ser1.sort_index() print(ser2) print(ser1.order()) from numpy.random import randn ser3 = Series(randn(10)) print(ser3.rank()) print(ser3.sort())
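Note that in the pandas versions this snippet targets, Series.order() returned a sorted copy while Series.sort() sorted in place and returned None, so the final print shows None rather than a sorted Series. The non-mutating equivalent today (random data, as in the original):

import pandas as pd
from numpy.random import randn

ser3 = pd.Series(randn(10))
print(ser3.sort_values())   # sorted copy; ser3 itself is left untouched
print(ser3.rank())          # ranks computed on the original order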
def check_close_price_by_variety_id(variety_id, instrument_list): global month_str, pre_month_str global close_limit_update_time sixth_instrument = None quote_map = {} for one_file_name in instrument_list: quote_file = open(one_file_name, "r") quote_list = quote_file.readlines() quote_file.close() instrument_id = one_file_name.split("\\")[-1].split(".")[0] close_quote = CBest_Market_Data_Field() if len(quote_list) > 3: if len(quote_list[-1]) > 2: close_quote = Get_CBest_Market_Data_Field_From_Line( quote_list[-2]) else: close_quote = Get_CBest_Market_Data_Field_From_Line( quote_list[-3]) quote_map[instrument_id] = close_quote if len(quote_map) > 2: # 寻找主力合约 best_quote_frame = Series() for (instrument_id, close_quote) in quote_map.items(): if instrument_id[-2:] != month_str: best_quote = Series( [instrument_id, close_quote.Total_Match_Volume]) best_quote_frame = best_quote_frame.append(best_quote) best_quote_frame = Series(best_quote_frame[1].values, index=best_quote_frame[0].values) best_quote_frame.sort() main_instrument = best_quote_frame.index[-1] sub_instrument = best_quote_frame.index[-2] ssub_instrument = best_quote_frame.index[-3] forth_instrument = best_quote_frame.index[-4] fifth_instrument = best_quote_frame.index[-5] if len(best_quote_frame.index) > 5: sixth_instrument = best_quote_frame.index[-6] first_instrument = main_instrument second_instrument = sub_instrument trigger_ref_spread_price = (float(quote_map[main_instrument].Bid_Price1) + float(quote_map[main_instrument].Ask_Price1)) / 2 - \ (float(quote_map[second_instrument].Bid_Price1) + float(quote_map[second_instrument].Ask_Price1)) / 2 tick, exchange_id = get_attribution(variety_id) if second_instrument[-2:] == pre_month_str: trigger_price_range = trigger_price_range_dict[variety_id][ 0] + 10 * tick else: trigger_price_range = trigger_price_range_dict[variety_id][0] extreme_price_range = extreme_price_range_dict[variety_id][0] open_order_volume = open_order_volume_dict[variety_id][0] max_open_order_volume = max_open_order_volume_dict[variety_id][0] stop_tick = stop_tick_dict[variety_id][0] * tick line4 = "\t\t\t<main_instrument>" + main_instrument + "</main_instrument>\n" line6 = "\t\t\t<sub_instrument>" + second_instrument + "</sub_instrument>\n" line7 = "\t\t\t<open_order_volume>" + str( int(open_order_volume)) + "</open_order_volume>\n" line71 = "\t\t\t<stop_tick>" + str(float(stop_tick)) + "</stop_tick>\n" line8 = "\t\t\t<tigger_ref_spread_price>" + str( trigger_ref_spread_price) + "</tigger_ref_spread_price>\n" line9 = "\t\t\t<tigger_price_range>" + str( int(trigger_price_range)) + "</tigger_price_range>\n" line91 = "\t\t\t<price_multiple>" + str( float(tick) * 2) + "</price_multiple>\n" line92 = "\t\t\t<order_multiple>" + str( int(math.ceil( float(open_order_volume) / 2))) + "</order_multiple>\n" line93 = "\t\t\t<max_open_order_volume>" + str( int(max_open_order_volume)) + "</max_open_order_volume>\n" line10 = "\t\t</auction_arbi_variety>\n" line_add1 = "\t\t\t<open_over_take>" + str( float(tick)) + "</open_over_take>\n" line_add2 = "\t\t\t<extreme_price_range>" + str( int(extreme_price_range)) + "</extreme_price_range>\n" line_add4 = "\t\t\t<main_order_volume>" + str( int(main_order_volume)) + "</main_order_volume>\n" line_add5 = "\t\t\t<main_lost_tick>" + str( float(main_lost_tick_num * tick)) + "</main_lost_tick>\n" line_add6 = "\t\t\t<total_volume_limit_ratio>" + str( float(total_volume_limit_ratio)) + "</total_volume_limit_ratio>\n" line_add7 = "\t\t\t<spam_num>" + str(int(spam_num)) + "</spam_num>\n" line_add8 
= "\t\t\t<first_level_volume>" + str( int(first_level_volume)) + "</first_level_volume>\n" line_add9 = "\t\t\t<second_level_volume>" + str( int(second_level_volume)) + "</second_level_volume>\n" line_add10 = "\t\t\t<third_level_volume>" + str( int(third_level_volume)) + "</third_level_volume>\n" line_add11 = "\t\t\t<reserved_profit>" + str( reserved_tick * float(tick)) + "</reserved_profit>\n" if exchange_id == 'SHFE': _, mean_volume = get_open_volume_series(main_instrument, trading_day) if variety_id not in latency_variety_id_array: line3 = "\t\t<auction_arbi_variety exchange_id=\"" + exchange_id + "\" variety_id=\"" + variety_id + \ "\" tick=\"" + str(float(tick)) + "\" trigger_time=\"20:58:59.700\"" + " try_czce_order_time=\"20:58:30.000\">\n" else: line3 = "\t\t<auction_arbi_variety exchange_id=\"" + exchange_id + "\" variety_id=\"" + variety_id + \ "\" tick=\"" + str(float(tick)) + "\" trigger_time=\"20:58:59.400\"" + " try_czce_order_time=\"20:58:30.000\">\n" line93 = "\t\t\t<max_open_order_volume>" + str( int(max_open_order_volume / 2)) + "</max_open_order_volume>\n" line_add3 = "\t\t\t<main_average_his_volume>" + str( int(mean_volume * open_volume_ma_limit_ratio) ) + "</main_average_his_volume>\n" print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, line92, \ line93, line_add3, line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10, line_add11, line10 reach_limit_price_result_file.flush() else: line3 = "\t\t<auction_arbi_variety exchange_id=\"" + exchange_id + "\" variety_id=\"" + variety_id + \ "\" tick=\"" + str(float(tick)) + "\" trigger_time=\"20:58:59.450\"" + " try_czce_order_time=\"20:58:30.000\">\n" line_add3 = "\t\t\t<main_average_his_volume>" + str( 200) + "</main_average_his_volume>\n" print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, line92, \ line93, line_add3, line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10, line_add11, line10 reach_limit_price_result_file.flush() if variety_id in two_variety_list: second_instrument = ssub_instrument trigger_price_range = trigger_price_range_dict[variety_id][1] trigger_ref_spread_price = (float(quote_map[main_instrument].Bid_Price1) + float(quote_map[main_instrument].Ask_Price1)) / 2 - \ (float(quote_map[second_instrument].Bid_Price1) + float(quote_map[second_instrument].Ask_Price1)) / 2 ref_spread_last_price = float( quote_map[main_instrument].Last_Price) - float( quote_map[second_instrument].Last_Price) bid_ask_spread = abs(ref_spread_last_price - trigger_ref_spread_price) / tick if bid_ask_spread < 15 and quote_map[ second_instrument].Update_Time > close_limit_update_time: line6 = "\t\t\t<sub_instrument>" + second_instrument + "</sub_instrument>\n" line7 = "\t\t\t<open_order_volume>" + str( int(math.ceil(float(open_order_volume) / 2))) + "</open_order_volume>\n" line8 = "\t\t\t<tigger_ref_spread_price>" + str( trigger_ref_spread_price) + "</tigger_ref_spread_price>\n" line9 = "\t\t\t<tigger_price_range>" + str( int(trigger_price_range)) + "</tigger_price_range>\n" line91 = "\t\t\t<price_multiple>" + str( float(tick) * 3) + "</price_multiple>\n" line93 = "\t\t\t<max_open_order_volume>" + str( int(max_open_order_volume)) + "</max_open_order_volume>\n" line_add1 = "\t\t\t<open_over_take>" + str( float(tick) * 3) + "</open_over_take>\n" print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, 
line92, \ line93, line_add3, line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10,line_add11, line10 reach_limit_price_result_file.flush() if variety_id in three_variety_list: second_instrument = forth_instrument trigger_price_range = trigger_price_range_dict[variety_id][2] trigger_ref_spread_price = (float(quote_map[main_instrument].Bid_Price1) + float(quote_map[main_instrument].Ask_Price1)) / 2 - \ (float(quote_map[second_instrument].Bid_Price1) + float(quote_map[second_instrument].Ask_Price1)) / 2 ref_spread_last_price = float( quote_map[main_instrument].Last_Price) - float( quote_map[second_instrument].Last_Price) bid_ask_spread = abs(ref_spread_last_price - trigger_ref_spread_price) / tick if bid_ask_spread < 15 and quote_map[ second_instrument].Update_Time > close_limit_update_time: line6 = "\t\t\t<sub_instrument>" + second_instrument + "</sub_instrument>\n" line7 = "\t\t\t<open_order_volume>" + str( int(math.ceil(float(open_order_volume) / 3))) + "</open_order_volume>\n" line8 = "\t\t\t<tigger_ref_spread_price>" + str( trigger_ref_spread_price) + "</tigger_ref_spread_price>\n" line9 = "\t\t\t<tigger_price_range>" + str( int(trigger_price_range)) + "</tigger_price_range>\n" line91 = "\t\t\t<price_multiple>" + str( float(tick) * 5) + "</price_multiple>\n" line93 = "\t\t\t<max_open_order_volume>" + str( int(max_open_order_volume)) + "</max_open_order_volume>\n" line_add1 = "\t\t\t<open_over_take>" + str( float(tick) * 5) + "</open_over_take>\n" print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, line92, \ line93, line_add3, line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10,line_add11, line10 reach_limit_price_result_file.flush() if variety_id in forth_variety_list: second_instrument = fifth_instrument trigger_price_range = trigger_price_range_dict[variety_id][3] trigger_ref_spread_price = (float(quote_map[main_instrument].Bid_Price1) + float(quote_map[main_instrument].Ask_Price1)) / 2 - \ (float(quote_map[second_instrument].Bid_Price1) + float(quote_map[second_instrument].Ask_Price1)) / 2 ref_spread_last_price = float( quote_map[main_instrument].Last_Price) - float( quote_map[second_instrument].Last_Price) bid_ask_spread = abs(ref_spread_last_price - trigger_ref_spread_price) / tick if bid_ask_spread < 15 and quote_map[ second_instrument].Update_Time > close_limit_update_time: line6 = "\t\t\t<sub_instrument>" + second_instrument + "</sub_instrument>\n" line7 = "\t\t\t<open_order_volume>" + str( int(math.ceil(float(open_order_volume) / 5))) + "</open_order_volume>\n" line8 = "\t\t\t<tigger_ref_spread_price>" + str( trigger_ref_spread_price) + "</tigger_ref_spread_price>\n" line9 = "\t\t\t<tigger_price_range>" + str( int(trigger_price_range)) + "</tigger_price_range>\n" line91 = "\t\t\t<price_multiple>" + str( float(tick) * 8) + "</price_multiple>\n" line93 = "\t\t\t<max_open_order_volume>" + str( int(max_open_order_volume)) + "</max_open_order_volume>\n" line_add1 = "\t\t\t<open_over_take>" + str( float(tick) * 5) + "</open_over_take>\n" print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, line92, \ line93, line_add3, line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10,line_add11, line10 reach_limit_price_result_file.flush() if variety_id in fifth_variety_list: ref_spread_last_price = float( quote_map[main_instrument].Last_Price) - float( quote_map[second_instrument].Last_Price) 
bid_ask_spread = abs(ref_spread_last_price - trigger_ref_spread_price) / tick if bid_ask_spread < 15 and quote_map[ second_instrument].Update_Time > close_limit_update_time: second_instrument = sixth_instrument trigger_price_range = trigger_price_range_dict[variety_id][4] trigger_ref_spread_price = (float(quote_map[main_instrument].Bid_Price1) + float(quote_map[main_instrument].Ask_Price1)) / 2 - \ (float(quote_map[second_instrument].Bid_Price1) + float(quote_map[second_instrument].Ask_Price1)) / 2 line6 = "\t\t\t<sub_instrument>" + second_instrument + "</sub_instrument>\n" line7 = "\t\t\t<open_order_volume>" + str( int(math.ceil(float(open_order_volume) / 5))) + "</open_order_volume>\n" line8 = "\t\t\t<tigger_ref_spread_price>" + str( trigger_ref_spread_price) + "</tigger_ref_spread_price>\n" line9 = "\t\t\t<tigger_price_range>" + str( int(trigger_price_range)) + "</tigger_price_range>\n" line91 = "\t\t\t<price_multiple>" + str( float(tick) * 10) + "</price_multiple>\n" line93 = "\t\t\t<max_open_order_volume>" + str( int(max_open_order_volume)) + "</max_open_order_volume>\n" line_add1 = "\t\t\t<open_over_take>" + str( float(tick) * 7) + "</open_over_take>\n" print>> reach_limit_price_result_file, line3, line4, line6, line7, line71, line8, line9, line91, line_add1, line_add2, line92, \ line93, line_add3, line_add4, line_add5, line_add6, line_add7, line_add8, line_add9, line_add10,line_add11, line10 reach_limit_price_result_file.flush()
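The contract-selection step near the top of the function builds a Series mapping instrument to traded volume, sorts it by value, and reads the most liquid contracts off the tail of the sorted index. A compact sketch of that idea with invented instrument names and volumes:

import pandas as pd

# Sketch only: rank instruments by volume and pick the main / sub contracts.
volumes = pd.Series({'ru2401': 120000, 'ru2405': 340000, 'ru2409': 90000})
ranked = volumes.sort_values()          # ascending, like the in-place .sort() in the original
main_instrument = ranked.index[-1]      # highest volume
sub_instrument = ranked.index[-2]       # second highest
print(main_instrument, sub_instrument)  # ru2405 ru2401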
def test_value_counts_inferred(self): klasses = [Index, Series] for klass in klasses: s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] s = klass(s_values) expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.unique(s_values)) self.assertEquals(s.nunique(), 4) # don't sort, have to sort after the fact as not sorting is platform-dep hist = s.value_counts(sort=False) hist.sort() expected = Series([3, 1, 4, 2], index=list('acbd')) expected.sort() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) expected = Series([1, 2, 3, 4], index=list('cdab')) tm.assert_series_equal(hist, expected) # relative histogram. hist = s.value_counts(normalize=True) expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(hist, expected) # bins self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({0.998: 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3])) self.assertEquals(s1.nunique(), 3) res4 = s1.value_counts(bins=4) exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'] s = klass(s_values) expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O')) self.assertEquals(s.nunique(), 3) s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.array([])) self.assertEquals(s.nunique(), 0) # GH 3002, datetime64[ns] txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG', 'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM']) f = StringIO(txt) df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) s = klass(df['dt'].copy()) idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X']) expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'], dtype='datetime64[ns]') if isinstance(s, DatetimeIndex): expected = DatetimeIndex(expected) self.assert_(s.unique().equals(expected)) else: self.assert_numpy_array_equal(s.unique(), expected) self.assertEquals(s.nunique(), 3) # with NaT s = df['dt'].copy() s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() self.assertEqual(result.index.dtype, 'datetime64[ns]') expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) unique = s.unique() self.assertEqual(unique.dtype, 'datetime64[ns]') # numpy_array_equal cannot compare pd.NaT self.assert_numpy_array_equal(unique[:3], expected) self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT) self.assertEquals(s.nunique(), 4) # timedelta64[ns] td = df.dt - df.dt + timedelta(1) td = 
klass(td) result = td.value_counts() expected_s = Series([6], index=[86400000000000]) self.assertEqual(result.index.dtype, 'int64') tm.assert_series_equal(result, expected_s) # get nanoseconds to compare expected = np.array([86400000000000]) self.assert_numpy_array_equal(td.unique(), expected) self.assertEquals(td.nunique(), 1) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2) result2 = td2.value_counts() self.assertEqual(result2.index.dtype, 'int64') tm.assert_series_equal(result2, expected_s) self.assert_numpy_array_equal(td.unique(), expected) self.assertEquals(td.nunique(), 1)
def setCategories(self, Categories: Series): Categories = Categories.unique() Categories.sort() self.__Encoder = Encoder() self.__Encoder.fit(Categories)
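Encoder is not defined in this snippet; assuming it behaves like scikit-learn's LabelEncoder, setCategories amounts to fitting an encoder on the sorted unique category labels. A self-contained sketch under that assumption:

import pandas as pd
from sklearn.preprocessing import LabelEncoder   # assumed stand-in for Encoder

categories = pd.Series(['cat', 'dog', 'cat', 'bird'])
unique_sorted = categories.unique()   # ndarray of unique labels, in order of appearance
unique_sorted.sort()                  # ndarray.sort() sorts in place
encoder = LabelEncoder()
encoder.fit(unique_sorted)
print(encoder.transform(['bird', 'cat', 'dog']))   # [0 1 2]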