from collections import OrderedDict
import logging

import numpy as np
import pandas as pd


def build_spectrum_table(spectrum_file, schema, index=None, **kwargs):
    """
    Factor out the common code required to auto-populate an ADR Spectrum
    resource from a PJNZ file. The ADR resource validation schema is used to
    build a dataframe, which is then filled with data from the PJNZ file.

    IMPORTANT - This function evaluates snippets of code from the JSON
    schemas. This is not ideal, as the snippets would ideally be brought into
    the Python ecosystem. However, for the time being it was seen as the
    cleanest way to store the complex mapping of data from PJNZ to ADR
    resource.
    """
    # The JSON schemas reference the spectrum file as "sf" - give it that shorthand
    sf = spectrum_file

    # Remove the first schema field as this is the header/index.
    # Copy the field list as well, so the caller's schema is not mutated.
    schema = schema.copy()
    schema['fields'] = list(schema['fields'])
    first_field = schema['fields'].pop(0)

    # Assemble the populated data file in dictionaries
    new_table = OrderedDict()
    for field in schema['fields']:
        if field.get('spectrum_file_key', False):
            # Fill the row in with spectrum data
            try:
                # IMPORTANT - We evaluate a snippet of code from the JSON file
                data_series = list(eval(field['spectrum_file_key']))
            except Exception:
                logging.error("Failed to evaluate %s spectrum_file_key: %s",
                              field['name'], field['spectrum_file_key'])
                raise
            new_table[field['name']] = data_series
        else:
            # If no spectrum_file_key is given, leave the series empty
            new_table[field['name']] = []

    # Fill in empty series with NaN (must match the other series' length)
    max_length = max(len(x) for x in new_table.values())
    for key, value in new_table.items():
        if len(value) == 0:
            new_table[key] = [np.nan] * max_length
    new_table = pd.DataFrame.from_dict(new_table, **kwargs)

    # Fix the indices if they are manually specified
    if index:
        new_table.index = index
    # Fix the indices if they are specified with a spectrum_file_key
    elif first_field.get('spectrum_file_key', False):
        new_table.index = list(eval(first_field['spectrum_file_key']))
        new_table.insert(0, first_field['name'], new_table.index)
    return new_table
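
# A minimal sketch of the kind of schema build_spectrum_table expects. The
# field names and the sf[...] snippets are hypothetical, invented purely for
# illustration; real spectrum_file_key snippets live in the ADR validation
# JSON schemas.
schema = {
    'fields': [
        {'name': 'year', 'spectrum_file_key': "sf['years']"},        # becomes the index
        {'name': 'deaths', 'spectrum_file_key': "sf['aids_deaths']"},
        {'name': 'notes'},                                           # no key -> filled with NaN
    ]
}
spectrum_file = {'years': [2018, 2019], 'aids_deaths': [120, 95]}

table = build_spectrum_table(spectrum_file, schema)
print(table)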
def setSelectSampleTextIndex(thisFont, tab=None, marker="### CUSTOM KERN STRING ###"):
    if Glyphs.versionNumber >= 3:
        # Glyphs 3 code
        sampleTexts = OrderedDict(
            (d['name'], d['text']) for d in Glyphs.defaults["SampleTextsList"]
        )
        foundSampleString = False
        for sampleTextIndex, k in enumerate(sampleTexts.keys()):
            if marker in k:
                foundSampleString = True
                if not tab:
                    tab = thisFont.currentTab
                    if not tab:
                        tab = thisFont.newTab()
                tab.selectSampleTextArrayController().setSelectionIndex_(sampleTextIndex + 1)
                # select the entry *after* the marker; dict views are not
                # subscriptable in Python 3, so go through a list
                nextKey = list(sampleTexts.keys())[sampleTextIndex + 1]
                tab.text = sampleTexts[nextKey]
                break
        if not foundSampleString:
            print("Warning: Could not find '%s' in sample strings." % marker)
    else:
        # Glyphs 2 code
        sampleTexts = tuple(Glyphs.defaults["SampleTexts"])
        # tuple.index() raises ValueError rather than returning -1,
        # so test membership first
        if marker in sampleTexts:
            sampleTextIndex = sampleTexts.index(marker)
            if not tab:
                tab = thisFont.currentTab
                if not tab:
                    tab = thisFont.newTab()
            tab.selectSampleTextArrayController().setSelectionIndex_(sampleTextIndex + 1)
            tab.text = sampleTexts[sampleTextIndex + 1]
        else:
            print("Warning: Could not find '%s' in sample strings." % marker)
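
# Usage sketch: assumes this runs inside the Glyphs macro window (where
# `Glyphs` is a global) and that a "### CUSTOM KERN STRING ###" entry exists
# in Edit > Sample Texts - a hypothetical setup, for illustration only.
font = Glyphs.font            # the frontmost font
setSelectSampleTextIndex(font)  # jumps the current tab to the kern string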
def export_silo(request, id):
    silo_name = Silo.objects.get(id=id).name

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="%s.csv"' % silo_name

    writer = csv.writer(response)

    silo_data = LabelValueStore.objects(silo_id=id)
    data = []
    num_cols = 0
    cols = OrderedDict()
    if silo_data:
        num_rows = len(silo_data)

        for row in silo_data:
            for col in row:
                if col not in cols:
                    num_cols += 1
                    cols[col] = num_cols

        # Convert the OrderedDict keys to a Python list so that they can be
        # written to the CSV writer and used for positional lookups below.
        cols = list(cols)
        writer.writerow(cols)

        # Populate a 2-D list structure that corresponds to the number of
        # rows and cols in silo_data
        for i in range(num_rows):
            data += [[0] * num_cols]

        for r, row in enumerate(silo_data):
            for col in row:
                # Map values to column names and place them in the correct
                # position in the data array
                data[r][cols.index(col)] = row[col]
            writer.writerow(data[r])
    return response
def ranking(self):
    with open(self.wordset_a) as f:
        content_a = [word.strip() for word in f]
    with open(self.wordset_b) as f:
        content_b = [word.strip() for word in f]
    result_matrix = self.result_matrix.todense()
    truth_matrix = self.truth_matrix.todense()
    row = 0
    targets = []
    rankings = []
    result_word_list = []
    truth_word_list = []
    for i in content_a:
        targets.append(i)
        column = 0
        result_dict = {}
        truth_dict = {}
        for j in content_b:
            result_dict[str(j)] = result_matrix[row, column]
            truth_dict[str(j)] = truth_matrix[row, column]
            column += 1
        # sort descending by score; materialise the keys as lists, since
        # dict views cannot be indexed in Python 3
        result_sort = list(OrderedDict(
            reversed(sorted(result_dict.items(), key=lambda t: float(t[1])))).keys())
        truth_sort = list(OrderedDict(
            reversed(sorted(truth_dict.items(), key=lambda t: float(t[1])))).keys())
        result_words = []
        truth_words = []
        iteration = 0
        rank_count = 0
        tr_rank = 0
        for l in range(0, 10):
            result_words.append(result_sort[l])
            truth_words.append(truth_sort[l])
            rank_count += (result_sort.index(truth_sort[l]) + 1)
            iteration += 1
            tr_rank += iteration
        rank = float(rank_count / 10.0)
        reference = float(tr_rank / 10.0)
        result_word_list.append(result_words)
        truth_word_list.append(truth_words)
        rankings.append(rank)
        row += 1
    avg_rank = float(sum(rankings) / len(rankings))
    return reference, avg_rank, rankings, result_word_list, truth_word_list, targets
def test_parse_keywords():
    info = [
        {'recscope': 'variable', 'units': 'none', 'name': 'cparms_sg000',
         'defval': 'compress Rice', 'note': '', 'type': 'string'},
        {'recscope': 'variable', 'units': 'none', 'name': 'mean_bzero',
         'defval': '0', 'note': '', 'type': 'double'},
        {'recscope': 'variable', 'units': 'none', 'name': 'mean_bscale',
         'defval': '0.25', 'note': '', 'type': 'double'},
        {'recscope': 'variable', 'units': 'TAI', 'name': 'MidTime',
         'defval': '-4712.01.01_11:59_TAI',
         'note': 'Midpoint of averaging interval', 'type': 'time'},
    ]
    exp = OrderedDict([
        ('name', ['cparms_sg000', 'mean_bzero', 'mean_bscale', 'MidTime']),
        ('type', ['string', 'double', 'double', 'time']),
        ('recscope', ['variable', 'variable', 'variable', 'variable']),
        ('defval', ['compress Rice', '0', '0.25', '-4712.01.01_11:59_TAI']),
        ('units', ['none', 'none', 'none', 'TAI']),
        ('note', ['', '', '', 'Midpoint of averaging interval']),
        ('linkinfo', [None, None, None, None]),
        ('is_time', [False, False, False, True]),
        ('is_integer', [False, False, False, False]),
        ('is_real', [False, True, True, False]),
        ('is_numeric', [False, True, True, False]),
    ])
    exp = pd.DataFrame(data=exp)
    exp.index = exp.pop('name')
    assert drms.SeriesInfo._parse_keywords(info).equals(exp)
def initial_rank_based_on_quality(graph):
    # sorted_quality: the rank of all nodes at the initial stage
    quality_dict = {}
    for i in range(30):
        # graph.node[i] before networkx 2.x; graph.nodes[i] since
        quality_dict[i] = graph.nodes[i]['quality']
    sorted_quality = OrderedDict(
        sorted(quality_dict.items(), key=lambda x: x[1]))
    # keys of the OrderedDict, ordered from lowest to highest quality
    sorted_quality = list(sorted_quality)

    # Convert sorted_quality to an initial rank for each node w.r.t. its quality
    quality_rank_list = []
    for i in range(30):
        quality_rank_list.append(sorted_quality.index(i))
    return quality_rank_list
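
# A quick usage sketch with a random 30-node graph (the function hard-codes
# 30 nodes); the 'quality' values here are made up for illustration.
import random
import networkx as nx

g = nx.gnp_random_graph(30, 0.2, seed=1)
for n in g.nodes:
    g.nodes[n]['quality'] = random.random()

ranks = initial_rank_based_on_quality(g)
print(ranks)  # ranks[i] == 0 means node i has the lowest quality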
def export_silo(request, id):
    silo_name = Silo.objects.get(id=id).name

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="%s.csv"' % silo_name

    writer = csv.writer(response)

    # Load the bson objects from mongo
    bsondata = store.find({"silo_id": int(id)})
    # Convert bson to a json string, using OrderedDict to maintain field order
    json_string = dumps(bsondata)
    # Decode the json string into a python object
    silo_data = json.loads(json_string, object_pairs_hook=OrderedDict)

    data = []
    num_cols = 0
    cols = OrderedDict()
    if silo_data:
        num_rows = len(silo_data)

        for row in silo_data:
            for col in row:
                if col not in cols:
                    num_cols += 1
                    cols[col] = num_cols

        # Convert the OrderedDict keys to a Python list so that they can be
        # written to the CSV writer and used for positional lookups below.
        cols = list(cols)
        writer.writerow(cols)

        # Populate a 2-D list structure that corresponds to the number of
        # rows and cols in silo_data
        for i in range(num_rows):
            data += [[0] * num_cols]

        for r, row in enumerate(silo_data):
            for col in row:
                # Map values to column names and place them in the correct
                # position in the data array
                val = row[col]
                if isinstance(val, OrderedDict):
                    val = val.popitem()
                if isinstance(val, tuple):
                    # mongo extended-json values come back as ("$date", ms)
                    # or ("$oid", hex-string) pairs
                    if val[0] == "$date":
                        val = smart_text(time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.localtime(val[1] / 1000)))
                    if val[0] == "$oid":
                        val = smart_text(val[1])
                # smart_text is enough on Python 3; the old latin-1/utf-8
                # byte juggling is unnecessary
                val = smart_text(val)
                data[r][cols.index(col)] = val
            writer.writerow(data[r])
    return response
def convertToMatrix(userls, filesList, census):
    '''Assumes "userls" is a list of followers, "filesList" is a list of
    file paths, and "census" is a list of popular users. Returns a large
    matrix whose rows are the users in userls and whose columns are the
    census users.'''
    y = OrderedDict()
    zeros = list()
    for i, j in enumerate(filesList):
        followers = list(pyreadr.read_r(j)['followers']['followers'])
        y[i] = np.isin(userls, followers) * 1
        if sum(y[i]) == 0:
            print(census[i], "is followed by 0 users!")
            zeros.append(census[i])
    y = pd.DataFrame.from_dict(y)
    y.index = userls
    y.columns = census
    y = y.drop(columns=zeros)  # drop census users who are followed by zero users
    return y
def parse_lines(lines):
    """Given a nested line list, create a flat list of unique lines and the
    associated mapping matrix.
    """
    # flatten the line list and make it unique, preserving first-seen order
    lines_flat = OrderedDict()
    for line_components in lines:
        for l in line_components:
            lines_flat[l] = None
    lines_flat = tuple(lines_flat.keys())

    mapping = np.zeros([len(lines), len(lines_flat)], dtype=float)
    for i_out, line_components in enumerate(lines):
        for l in line_components:
            j_in = lines_flat.index(l)
            mapping[i_out, j_in] = 1.
    return lines_flat, mapping
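
# A tiny illustration (hypothetical emission-line names); assumes the
# module-level `numpy as np` and `OrderedDict` imports that parse_lines needs.
flat, mapping = parse_lines([['Halpha', 'NII'], ['Halpha']])
print(flat)     # -> ('Halpha', 'NII')
print(mapping)  # -> [[1. 1.]
                #     [1. 0.]]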
def widgets(self):
    """Display widgets for all parameters (i.e. property sheet)"""
    # order by param precedence, but with name first and persist last;
    # materialise the keys as a list so they can be reordered
    params = self.parameterized.params().items()
    ordered_params = list(OrderedDict(
        sorted(params, key=lambda x: x[1].precedence)).keys())
    ordered_params.insert(0, ordered_params.pop(ordered_params.index('name')))

    widgets = [self.widget(pname) for pname in ordered_params]

    button = None
    if self.p.onchange:
        pass
    elif self.blocked:
        button = 'Run %s' % self.p.execute
    elif self.p.callback:
        button = 'Execute'
    if button:
        display_button = ipywidgets.Button(description=button)
        display_button.on_click(self.execute_widget)
        widgets.append(display_button)
    return widgets
def test_parse_segments():
    segments = [
        {'type': 'int', 'dims': 'VARxVAR', 'units': 'Gauss',
         'protocol': 'fits', 'note': 'magnetogram', 'name': 'magnetogram'},
        {'type': 'char', 'dims': 'VARxVAR', 'units': 'Enumerated',
         'protocol': 'fits', 'note': 'Mask for the patch', 'name': 'bitmap'},
        {'type': 'int', 'dims': 'VARxVAR', 'units': 'm/s',
         'protocol': 'fits', 'note': 'Dopplergram', 'name': 'Dopplergram'},
    ]
    exp = OrderedDict([
        ('name', ['magnetogram', 'bitmap', 'Dopplergram']),
        ('type', ['int', 'char', 'int']),
        ('units', ['Gauss', 'Enumerated', 'm/s']),
        ('protocol', ['fits', 'fits', 'fits']),
        ('dims', ['VARxVAR', 'VARxVAR', 'VARxVAR']),
        ('note', ['magnetogram', 'Mask for the patch', 'Dopplergram']),
    ])
    exp = pd.DataFrame(data=exp)
    exp.index = exp.pop('name')
    assert drms.SeriesInfo._parse_segments(segments).equals(exp)
class PsParser:
    """Parse ps output."""

    def __init__(self, command="ps -Al"):
        output = subprocess.check_output(command, shell=True)
        self.result = {}
        self.headers = []
        processes = output.splitlines()
        # split on whitespace, but keep the final field (the command line,
        # which may itself contain spaces) intact
        nfields = len(processes[0].split()) - 1
        self.lines = []
        for k, row in enumerate(processes):
            row = row.decode('utf-8')
            data = row.split(None, nfields)
            if k == 0:
                # the first row holds the column headers
                self.headers = data
                for header in data:
                    self.result[header] = []
                continue
            self.lines.append(data)
            for idx, value in enumerate(data):
                self.result[self.headers[idx]].append(value)

    def get_pid(self, pid):
        # Proc is a wrapper class defined elsewhere in the module
        result = [Proc(k, self) for k in self.lines
                  if int(k[self.headers.index('PID')]) == pid]
        return result[0] if result else None
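
# Usage sketch: assumes a Unix-like system where `ps -Al` is available and
# the module's Proc wrapper class is defined.
ps = PsParser()
print(ps.headers)     # e.g. ['F', 'S', 'UID', 'PID', ...]
proc = ps.get_pid(1)  # Proc for PID 1, or None if not listed
if proc is not None:
    print(proc)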
def merge(self, *args):
    '''
    Merge some continuous and ascending labels of a tensor into a new one
    with an optional permutation.

    Usage: ``tensor.merge((olds,new,<permutation>),(olds,new,<permutation>),...)``
        * olds: list of Label/int
            The old labels/axes to be merged.
        * new: Label
            The new label.
        * permutation: 1d ndarray of int, optional
            The permutation of the quantum number collection of the new label.

    Returns
    -------
    DTensor
        The new tensor.
    '''
    permutations = {}
    keep = OrderedDict((i, i) for i in range(self.ndim))
    labels = OrderedDict((i, label) for i, label in enumerate(self.labels))
    for arg in args:
        assert len(arg) in (2, 3)
        olds, new, permutation = (arg[0], arg[1], None) if len(arg) == 2 else arg
        axes = np.array([self.axis(old) if isinstance(old, Label) else old
                         for old in olds])
        if len(axes) != max(axes) - min(axes) + 1 or not all(axes[1:] > axes[:-1]):
            raise ValueError('DTensor merge error: the axes to be merged should be '
                             'continuous and ascending, please call transpose first.')
        permutations[new] = permutation
        keep[axes[0]] = slice(axes[0], axes[-1] + 1)
        labels[axes[0]] = new
        for axis in axes[1:]:
            keep.pop(axis)
            labels.pop(axis)
    data = self.data.reshape(tuple(
        np.prod(self.data.shape[ax]) if isinstance(ax, slice) else self.data.shape[ax]
        for ax in keep.values()))
    labels = list(labels.values())
    for label, permutation in permutations.items():
        data = hm.reorder(data, axes=[labels.index(label)], permutation=permutation)
    return DTensor(data, labels=labels)
def test_parse_links():
    links = [
        {'name': 'BHARP', 'kind': 'DYNAMIC', 'note': 'Bharp',
         'target': 'hmi.Bharp_720s'},
        {'name': 'MHARP', 'kind': 'DYNAMIC', 'note': 'Mharp',
         'target': 'hmi.Mharp_720s'},
    ]
    exp = OrderedDict([
        ('name', ['BHARP', 'MHARP']),
        ('target', ['hmi.Bharp_720s', 'hmi.Mharp_720s']),
        ('kind', ['DYNAMIC', 'DYNAMIC']),
        ('note', ['Bharp', 'Mharp']),
    ])
    exp = pd.DataFrame(data=exp)
    exp.index = exp.pop('name')
    assert drms.SeriesInfo._parse_links(links).equals(exp)
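
# For context: these private parsers back the public drms client, which
# exposes the parsed tables as pandas DataFrames. This sketch needs network
# access to the JSOC server.
import drms

c = drms.Client()
si = c.info('hmi.v_45s')   # a SeriesInfo instance
print(si.keywords.head())  # keyword table, indexed by name
print(si.segments)         # segment table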
class LiteratureExtension(Extension):
    """ Literature Extension. """

    def __init__(self, *args, **kwargs):
        """ Setup configs. """
        self.config = {
            'PLACE_MARKER':
                ["///Literature Goes Here///",
                 "The text string that marks where the literature references go"],
            'UNIQUE_IDS':
                [False,
                 "Avoid name collisions across multiple calls to reset()."],
            "BACKLINK_TEXT":
                ["↩",
                 "The text string that links from the literature reference "
                 "to the reader's place."]
        }
        super().__init__(*args, **kwargs)

        # In multiple invocations, emit links that don't get tangled.
        self.unique_prefix = 0
        self.found_refs = {}
        self.used_refs = set()

        self.reset()

    def extendMarkdown(self, md):
        """ Add pieces to Markdown. """
        md.registerExtension(self)
        self.parser = md.parser
        self.md = md
        # Insert a preprocessor before ReferencePreprocessor
        md.preprocessors.register(LiteraturePreprocessor(self), "literature", 1)
        # Insert an inline pattern before ImageReferencePattern
        LITERATURE_RE = r'\[\=([^\]]*)\]'  # blah blah [=1] blah
        md.inlinePatterns.register(LiteraturePattern(LITERATURE_RE, self), "literature", 80)
        # Insert a tree-processor that would actually add the literatures div.
        # This must be before all other treeprocessors (i.e., inline and
        # codehilite) so they can run on the contents of the div.
        md.treeprocessors.register(LiteratureTreeprocessor(self), "literature", 100)
        # Insert a postprocessor after the amp_substitute postprocessor
        md.postprocessors.register(LiteraturePostprocessor(self), "literature", 120)

    def reset(self):
        """ Clear literature references on reset, and prepare for distinct document. """
        self.literatures = OrderedDict()
        self.unique_prefix += 1

    def findLiteraturesPlaceholder(self, root):
        """ Return ElementTree Element that contains Literature placeholder. """
        def finder(element):
            for child in element:
                if child.text:
                    if child.text.find(self.getConfig("PLACE_MARKER")) > -1:
                        return child, element, True
                if child.tail:
                    if child.tail.find(self.getConfig("PLACE_MARKER")) > -1:
                        return child, element, False
                child_res = finder(child)
                if child_res is not None:
                    return child_res
            return None

        res = finder(root)
        return res

    def setLiterature(self, identifier, text):
        """ Store a literature for later retrieval. """
        self.literatures[identifier] = text

    def get_separator(self):
        if self.md.output_format in ['html5', 'xhtml5']:
            return '-'
        return ':'

    def makeLiteratureId(self, identifier):
        """ Return literature link identifier. """
        if self.getConfig("UNIQUE_IDS"):
            return 'lit%s%d-%s' % (self.get_separator(), self.unique_prefix, identifier)
        else:
            return 'lit%s%s' % (self.get_separator(), identifier)

    def makeLiteratureRefId(self, identifier):
        """ Return literature back-link identifier. """
        if self.getConfig("UNIQUE_IDS"):
            return 'litref%s%d-%s' % (self.get_separator(), self.unique_prefix, identifier)
        else:
            return 'litref%s%s' % (self.get_separator(), identifier)

    def makeLiteraturesDiv(self, root):
        """ Return div of literatures as an etree Element. """
        if not list(self.literatures.keys()):
            return None

        div = etree.Element("div")
        div.set('class', 'literature')
        etree.SubElement(div, "hr")
        ol = etree.SubElement(div, "ol")

        for identifier in self.literatures.keys():
            li = etree.SubElement(ol, "li")
            li.set("id", self.makeLiteratureId(identifier))
            self.parser.parseChunk(li, self.literatures[identifier])
            backlink = etree.Element("a")
            backlink.set("href", "#" + self.makeLiteratureRefId(identifier))
            if self.md.output_format not in ['html5', 'xhtml5']:
                backlink.set("rev", "literature")  # Invalid in HTML5
            backlink.set("class", "literature-backref")
            # OrderedDict has no .index(); look the position up via the key list
            backlink.set(
                "title",
                "Jump back to literature %d in the text"
                % (list(self.literatures.keys()).index(identifier) + 1)
            )
            backlink.text = LIT_BACKLINK_TEXT  # module-level constant
            if list(li):
                node = li[-1]
                if node.tag == "p":
                    node.text = node.text + NBSP_PLACEHOLDER  # module-level constant
                    node.append(backlink)
                else:
                    p = etree.SubElement(li, "p")
                    p.append(backlink)
        return div
    'laplace': 'Laplace',
    'binomial': 'Binomial'
}

cols = {'sfmt': 'SFMT',
        'dsfmt': 'dSFMT',
        'xoroshiro128plus': 'xoroshiro128+',
        'xorshift1024': 'xorshift1024',
        'pcg64': 'PCG64',
        'mt19937': 'MT19937',
        'random': 'NumPy MT19937'}

results.columns = [cols[c] for c in results]
results.index = [index[i] for i in results.index]
print(results)

from io import StringIO

# dump the table to CSV and indent it as the body of an RST csv-table
sio = StringIO()
results.to_csv(sio)
sio.seek(0)
lines = sio.readlines()
for i, line in enumerate(lines):
    if i == 0:
        line = '    :header: ' + line
    else:
        line = '    ' + line
    lines[i] = line
def pmultiquery(corpus,
                search,
                show='words',
                query='any',
                sort_by='total',
                quicksave=False,
                multiprocess='default',
                function_filter=False,
                just_speakers=False,
                root=False,
                note=False,
                print_info=True,
                **kwargs):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator() if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) just_speakers == 'each', or a list of speakers with len(list) > 1

    This function needs joblib 0.8.4 or above in order to run properly.
    There's no reason to call it yourself."""

    import os
    import collections
    from collections import namedtuple, OrderedDict
    from time import strftime, localtime
    import pandas as pd
    import corpkit
    from interrogator import interrogator
    from editor import editor
    from other import save
    from interrogation import Interrogation
    try:
        from joblib import Parallel, delayed
    except ImportError:
        # joblib is optional; without it we fall back to serial processing
        pass
    import multiprocessing

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        import corpkit
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores)
                                if int(num_queries / n) <= num_cores])
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores:
                        return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (type(query) == list or type(query) == dict) and not hasattr(search, '__iter__'):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    elif hasattr(search, '__iter__') and type(search) != dict:
        # e.g. an OrderedDict of named searches
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))
    elif hasattr(function_filter, '__iter__'):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(list(function_filter.keys())))
        denom = len(list(function_filter.keys()))
    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    if type(multiprocess) == int:
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure quicksaves are the right type
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')

    # the options that don't change
    d = {'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}

    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['function_filter'] = q
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        # iterate over the named searches, so each process gets its own name
        for index, (name, val) in enumerate(search.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    if kwargs.get('do_concordancing') is False:
        message = 'Interrogating'
    elif kwargs.get('do_concordancing') is True:
        message = 'Interrogating and concordancing'
    elif kwargs.get('do_concordancing').lower() == 'only':
        message = 'Concordancing'

    time = strftime("%H:%M:%S", localtime())

    sformat = ''
    for i, (k, v) in enumerate(list(search.items())):
        if type(v) == list:
            vformat = ', '.join(v[:5])
            if len(v) > 5:
                vformat += ' ...'
        else:
            vformat = v
        sformat += '%s: %s' % (k, vformat)
        if i < len(search.keys()) - 1:
            sformat += '\n '

    if multiple_corpora and not multiple_option:
        corplist = "\n ".join([i.name for i in corpus[:20]])
        if len(corpus) > 20:
            corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
        print("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n %s"
              "\n Query: '%s'\n %s corpus ... \n"
              % (time, len(corpus), num_cores, corplist, sformat, message))
    elif multiple_queries:
        print("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s"
              "\n Queries: '%s'\n %s corpus ... \n"
              % (time, len(query), num_cores, corpus.name,
                 "', '".join(list(query.values())), message))
    elif multiple_search:
        print("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s"
              "\n Queries: '%s'\n %s corpus ... \n"
              % (time, len(list(search.keys())), num_cores, corpus.name,
                 str(list(search.values())), message))
    elif multiple_option:
        print("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s"
              "\n Query: '%s'\n %s corpus ... \n"
              % (time, num_cores, corpus.name, sformat, message))
    elif multiple_speakers:
        print("\n%s: Beginning %d parallel corpus interrogations: %s"
              "\n Query: '%s'\n %s corpus ... \n"
              % (time, num_cores, corpus.name, sformat, message))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    failed = False
    terminal = False
    used_joblib = False

    if not root:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except Exception:
                pass

    if not root and multiprocess:
        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except Exception:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted(res)
        except TypeError:
            pass

    import corpkit
    from interrogation import Concordance
    if kwargs.get('do_concordancing') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index)))
        return Concordance(concs)

    from collections import OrderedDict
    if not all(type(i.results) == pd.core.series.Series for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            out[interrog.query['outname']] = interrog

        if quicksave:
            fullpath = os.path.join('saved_interrogations', quicksave)
            while os.path.isdir(fullpath):
                selection = input("\nSave error: %s already exists in %s.\n\n"
                                  "Type 'o' to overwrite, or enter a new name: "
                                  % (quicksave, 'saved_interrogations'))
                if selection == 'o' or selection == 'O':
                    import shutil
                    shutil.rmtree(fullpath)
                else:
                    fullpath = os.path.join('saved_interrogations', selection)
            for k, v in list(out.items()):
                save(v, k, savedir=fullpath, print_info=False)

            time = strftime("%H:%M:%S", localtime())
            print("\n%s: %d files saved to %s" % (time, len(list(out.keys())), fullpath))

        time = strftime("%H:%M:%S", localtime())
        print("\n\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n"
              % (time, "'\n '".join(sorted(out.keys()))))
        from interrogation import Interrodict
        return Interrodict(out)
    # make query and total branch, save, return
    else:
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = pd.DataFrame(sers, index=[i.query['outname'] for i in res])
            out = out.reindex(sorted(out.columns), axis=1)  # sort cols
            out = out.fillna(0)  # nan to zero
            out = out.astype(int)  # float to int
            out = out.T
        else:
            out = pd.concat([r.results for r in res], axis=1)
            # format like normal
            out = out[sorted(list(out.columns))]
            out = out.T
            out = out.fillna(0)  # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort columns by total
        if type(out) == pd.core.frame.DataFrame:
            out.loc['Total-tmp'] = out.sum()
            tot = out.loc['Total-tmp']
            out = out[tot.sort_values(ascending=False).index]
            out = out.drop('Total-tmp', axis=0)
        out = out.edit(sort_by=sort_by, print_info=False, keep_stats=False,
                       df1_always_df=kwargs.get('df1_always_df'))
        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()
        if kwargs.get('do_concordancing') is True:
            concs = pd.concat([x.concordance for x in res], ignore_index=True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal:
            with terminal.location(0, terminal.height):
                print('\n\n%s: Finished! %d unique results, %d total.%s'
                      % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        else:
            print('\n\n%s: Finished! %d unique results, %d total.%s'
                  % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        if quicksave:
            from other import save
            save(out, quicksave)
            print('\n')
        return out
def assign_variables(assignment_expressions, df, locals_dict, df_alias=None, trace_rows=None):
    """
    Evaluate a set of variable expressions from a spec in the context
    of a given data table.

    Expressions are evaluated using Python's eval function.
    Python expressions have access to variables in locals_dict (and to df,
    accessible as the variable df). They also have access to previously
    assigned targets, as the assigned target name.

    Lowercase variables starting with underscore are temp variables
    (e.g. _local_var) and are not returned except in trace_results.

    Uppercase variables starting with underscore are temp scalar variables
    (e.g. _LOCAL_SCALAR) and are not returned except in trace_assigned_locals.
    This is useful for defining general-purpose local constants in an
    expression file.

    Users should take care that expressions (other than temp scalar variables)
    should result in a Pandas Series (scalars will be automatically promoted
    to series.)

    Parameters
    ----------
    assignment_expressions : pandas.DataFrame of target assignment expressions
        target: target column names
        expression: pandas or python expression to evaluate
    df : pandas.DataFrame
    locals_dict : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of "python" expression.
    trace_rows : series or array of bools
        Mask used to select target rows to trace.

    Returns
    -------
    variables : pandas.DataFrame
        Will have the index of `df` and columns named by target and containing
        the result of evaluating expression
    trace_df : pandas.DataFrame or None
        A dataframe containing the eval result values for each assignment
        expression.
    trace_assigned_locals : OrderedDict or None
        The traced temp scalar assignments.
    """
    np_logger = NumpyLogger(logger)

    def is_throwaway(target):
        return target == '_'

    def is_temp_scalar(target):
        return target.startswith('_') and target.isupper()

    def is_temp(target):
        return target.startswith('_')

    def to_series(x):
        if x is None or np.isscalar(x):
            return pd.Series([x] * len(df.index), index=df.index)
        return x

    assert assignment_expressions.shape[0] > 0

    trace_assigned_locals = trace_results = None
    if trace_rows is not None:
        # convert to numpy array so we can slice ndarrays as well as series
        trace_rows = np.asanyarray(trace_rows)
        if trace_rows.any():
            trace_results = OrderedDict()
            trace_assigned_locals = OrderedDict()

    # avoid touching the caller's passed-in locals_dict parameter (they may be looping)
    _locals_dict = local_utilities()
    if locals_dict is not None:
        _locals_dict.update(locals_dict)
    if df_alias:
        _locals_dict[df_alias] = df
    else:
        _locals_dict['df'] = df
    local_keys = list(_locals_dict.keys())

    # build a dataframe of eval results for non-temp targets
    # since we allow targets to be recycled, we want to only keep the last usage
    variables = OrderedDict()

    # need to be able to identify which variable causes an error, which keeps
    # this from being expressed more parsimoniously
    for e in zip(assignment_expressions.target, assignment_expressions.expression):
        target, expression = e

        assert isinstance(target, str), \
            "expected target '%s' for expression '%s' to be string not %s" % \
            (target, expression, type(target))

        if target in local_keys:
            logger.warning("assign_variables target obscures local_d name '%s'", str(target))

        if is_temp_scalar(target) or is_throwaway(target):
            try:
                x = eval(expression, globals(), _locals_dict)
            except Exception as err:
                logger.error("assign_variables error: %s: %s", type(err).__name__, str(err))
                logger.error("assign_variables expression: %s = %s", str(target), str(expression))
                raise err

            if not is_throwaway(target):
                _locals_dict[target] = x
                if trace_assigned_locals is not None:
                    trace_assigned_locals[uniquify_key(trace_assigned_locals, target)] = x
            continue

        try:
            # FIXME - log any numpy warnings/errors but don't raise
            np_logger.target = str(target)
            np_logger.expression = str(expression)
            saved_handler = np.seterrcall(np_logger)
            save_err = np.seterr(all='log')

            # FIXME should whitelist globals for security?
            globals_dict = {}
            expr_values = to_series(eval(expression, globals_dict, _locals_dict))

            np.seterr(**save_err)
            np.seterrcall(saved_handler)
        except Exception as err:
            logger.error("assign_variables error: %s: %s", type(err).__name__, str(err))
            logger.error("assign_variables expression: %s = %s", str(target), str(expression))
            raise err

        if not is_temp(target):
            variables[target] = expr_values

        if trace_results is not None:
            trace_results[uniquify_key(trace_results, target)] = expr_values[trace_rows]

        # update locals to allow us to reference previously assigned targets
        _locals_dict[target] = expr_values

    if trace_results is not None:
        trace_results = pd.DataFrame.from_dict(trace_results)
        trace_results.index = df[trace_rows].index
        # add df columns to trace_results
        trace_results = pd.concat([df[trace_rows], trace_results], axis=1)

    # we stored the results in a dict - convert to df
    variables = util.df_from_dict(variables, index=df.index)

    return variables, trace_results, trace_assigned_locals
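
# To make the spec shape concrete, a hypothetical two-column
# assignment-expressions table (`target` / `expression`, as the docstring
# describes). The targets and expressions are invented; actually running
# the call requires the module's helpers (local_utilities, uniquify_key,
# NumpyLogger, util.df_from_dict).
import pandas as pd

spec = pd.DataFrame({
    'target': ['_MAX_HH', 'income_k', 'big_hh'],
    'expression': ['10', 'df.income / 1000', 'df.hhsize >= _MAX_HH'],
})
df = pd.DataFrame({'income': [42000, 87000], 'hhsize': [2, 12]})

# variables, trace_df, trace_locals = assign_variables(spec, df, locals_dict={})
# 'income_k' and 'big_hh' come back as columns; '_MAX_HH' is a temp scalar.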
def pmultiquery(corpus,
                search,
                show='words',
                query='any',
                sort_by='total',
                save=False,
                multiprocess='default',
                root=False,
                note=False,
                print_info=True,
                subcorpora=False,
                **kwargs):
    """
    - Parallel process multiple queries or corpora.
    - This function is used by corpkit.interrogator.interrogator()
      for multiprocessing.
    - There's no reason to call this function yourself.
    """
    import os
    from pandas import DataFrame, Series
    import pandas as pd
    import collections
    from collections import namedtuple, OrderedDict
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.process import canpickle
    try:
        from joblib import Parallel, delayed
    except ImportError:
        pass
    import multiprocessing

    locs = locals()
    for k, v in kwargs.items():
        locs[k] = v
    in_notebook = locs.get('in_notebook')

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        import corpkit
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores)
                                if int(num_queries / n) <= num_cores])
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores:
                        return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple = kwargs.get('multiple', False)
    mult_corp_are_subs = False
    if hasattr(corpus, '__iter__'):
        if all(getattr(x, 'level', False) == 's' for x in corpus):
            mult_corp_are_subs = True

    non_first_sub = None
    if subcorpora:
        non_first_sub = subcorpora[1:] if isinstance(subcorpora, list) else None
        subval = subcorpora if not non_first_sub else subcorpora[0]
        if subcorpora is True:
            import re
            subcorpora = re.compile(r'.*')
        else:
            # strange travis error happened here
            subcorpora = corpus.metadata['fields'][subval]
            if len(subcorpora) == 0:
                print('No %s metadata found.' % str(subval))
                return

    mapcores = {'datalist': [corpus, 'corpus'],
                'multiplecorpora': [corpus, 'corpus'],
                'namedqueriessingle': [query, 'query'],
                'namedqueriesmultiple': [search, 'search'],
                'subcorpora': [subcorpora, 'subcorpora']}

    # the [False, False] default is a dummy, just to produce a default pair
    toiter, itsname = mapcores.get(multiple, [False, False])
    if isinstance(toiter, dict):
        toiter = toiter.items()
    denom = len(toiter)
    num_cores = best_num_parallel(num_cores, denom)

    # todo: code below makes no sense
    vals = ['eachspeaker', 'multiplespeaker', 'namedqueriesmultiple']
    if multiple == 'multiplecorpora' and any(x is True for x in vals):
        from corpkit.corpus import Corpus, Corpora
        if isinstance(corpus, Corpora):
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    if isinstance(multiprocess, int):
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure saves are the right type
    if save is True:
        raise ValueError('save must be string when multiprocessing.')

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    locs['printstatus'] = False
    locs['multiprocess'] = False
    locs['df1_always_df'] = False
    locs['files_as_subcorpora'] = False
    locs['corpus'] = corpus

    if multiple == 'multiplespeaker':
        locs['multispeaker'] = True

    if isinstance(non_first_sub, list) and len(non_first_sub) == 1:
        non_first_sub = non_first_sub[0]

    # make the default query: drop anything that can't be pickled
    locs = {k: v for k, v in locs.items() if canpickle(v)}

    # make a new dict for every iteration
    ds = [dict(**locs) for i in range(denom)]
    for index, (d, bit) in enumerate(zip(ds, toiter)):
        d['paralleling'] = index
        if multiple in ['namedqueriessingle', 'namedqueriesmultiple']:
            d[itsname] = bit[1]
            d['outname'] = bit[0]
        elif multiple in ['multiplecorpora', 'datalist']:
            d['outname'] = bit.name.replace('-parsed', '')
            d[itsname] = bit
        elif multiple in ['subcorpora']:
            d[itsname] = bit
            jmd = {subval: bit}
            # put this earlier
            j2 = kwargs.get('just_metadata', False)
            if not j2:
                j2 = {}
            jmd.update(j2)
            d['just_metadata'] = jmd
            d['outname'] = bit
            d['by_metadata'] = False
            d['subcorpora'] = non_first_sub
            if non_first_sub:
                d['print_info'] = False

    # message printer should be a function...
    if kwargs.get('conc') is False:
        message = 'Interrogating'
    elif kwargs.get('conc') is True:
        message = 'Interrogating and concordancing'
    elif kwargs.get('conc').lower() == 'only':
        message = 'Concordancing'

    time = strftime("%H:%M:%S", localtime())
    from corpkit.process import dictformat

    if print_info:
        # proper printing for plurals
        # in truth this needs to be revised, it's horrible.
        sformat = dictformat(search, query)
        if num_cores == 1:
            add_es = ''
        else:
            add_es = 'es'
        if multiple in ['multiplecorpora', 'datalist']:
            corplist = "\n ".join([i.name for i in list(corpus)[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n %s"
                  "\n Query: %s\n %s corpus ... \n"
                  % (time, len(corpus), num_cores, add_es, corplist, sformat, message))
        elif multiple == 'namedqueriessingle':
            print("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s"
                  "\n Queries: %s\n %s corpus ... \n"
                  % (time, len(query), num_cores, add_es, corpus.name, sformat, message))
        elif multiple == 'namedqueriesmultiple':
            print("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s"
                  "\n Queries: %s\n %s corpus ... \n"
                  % (time, len(list(search.keys())), num_cores, add_es, corpus.name, sformat, message))
        elif multiple in ['eachspeaker', 'multiplespeaker']:
            print("\n%s: Beginning %d parallel corpus interrogation%s: %s"
                  "\n Query: %s\n %s corpus ... \n"
                  % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message))
        elif multiple in ['subcorpora']:
            print("\n%s: Beginning %d parallel corpus interrogation%s: %s"
                  "\n Query: %s\n %s corpus ... \n"
                  % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    failed = False
    terminal = False
    used_joblib = False
    # todo: the number of blank lines to print can be way wrong
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except Exception:
                pass

    if not root and multiprocess:
        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except Exception:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted([i for i in res if i])
        except TypeError:
            pass

    # remove unpicklable bits from query
    from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
    badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType)
    qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)}

    if hasattr(qlocs.get('corpus', False), 'name'):
        qlocs['corpus'] = qlocs['corpus'].path
    else:
        qlocs['corpus'] = list([i.path for i in qlocs.get('corpus', [])])

    # return just a concordance
    from corpkit.interrogation import Concordance
    if kwargs.get('conc') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        concs = concs.reset_index(drop=True)
        if kwargs.get('maxconc'):
            concs = concs[:kwargs.get('maxconc')]
        lines = Concordance(concs)
        if save:
            lines.save(save, print_info=print_info)
        if print_info:
            print('\n\n%s: Finished! %s results.\n\n'
                  % (thetime, format(len(concs.index), ',')))
        return lines

    # return interrodict (to become multiindex)
    if isinstance(res[0], Interrodict) or not all(isinstance(i.results, Series) for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[d['outname']] = interrog
        idict = Interrodict(out)
        if print_info:
            thetime = strftime("%H:%M:%S", localtime())
            print("\n\n%s: Finished! Output is multi-indexed." % thetime)
        idict.query = qlocs
        if save:
            idict.save(save, print_info=print_info)
        return idict

    # make query and total branch, save, return
    # todo: standardise this so we don't have to guess transposes
    if multiple == 'multiplecorpora' and not mult_corp_are_subs:
        sers = [i.results for i in res]
        out = DataFrame(sers, index=[i.query['outname'] for i in res])
        out = out.reindex(sorted(out.columns), axis=1)  # sort cols
        out = out.fillna(0)  # nan to zero
        out = out.astype(int)  # float to int
        out = out.T
    else:
        # make a series from counts
        if all(len(i.results) == 1 for i in res):
            out = pd.concat([r.results for r in res])
            out = out.sort_index()
        else:
            try:
                out = pd.concat([r.results for r in res], axis=1)
                out = out.T
                out.index = [i.query['outname'] for i in res]
            except ValueError:
                return None
            # format like normal
            # this sorts subcorpora, which are cols
            out = out[sorted(list(out.columns))]
            # puts subcorpora in the right place
            if not mult_corp_are_subs and multiple != 'subcorpora':
                out = out.T
            if multiple == 'subcorpora':
                out = out.sort_index()
            out = out.fillna(0)  # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

    # sort by total
    if isinstance(out, DataFrame):
        out = out[list(out.sum().sort_values(ascending=False).index)]
        # really need to figure out the deal with transposing!
        if all(x.endswith('.xml') for x in list(out.columns)) \
                or all(x.endswith('.txt') for x in list(out.columns)) \
                or all(x.endswith('.conll') for x in list(out.columns)):
            out = out.T
    if kwargs.get('nosubmode'):
        out = out.sum()

    from corpkit.interrogation import Interrogation
    tt = out.sum(axis=1) if isinstance(out, DataFrame) else out.sum()
    out = Interrogation(results=out, totals=tt, query=qlocs)

    if hasattr(out, 'columns') and len(out.columns) == 1:
        out = out.sort_index()

    if kwargs.get('conc') is True:
        try:
            concs = pd.concat([x.concordance for x in res], ignore_index=True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            if kwargs.get('maxconc'):
                concs = concs[:kwargs.get('maxconc')]
            out.concordance = Concordance(concs)
        except ValueError:
            out.concordance = None

    thetime = strftime("%H:%M:%S", localtime())
    if terminal:
        print(terminal.move(terminal.height - 1, 0))
    if print_info:
        if terminal:
            print(terminal.move(terminal.height - 1, 0))
        if hasattr(out.results, 'columns'):
            print('%s: Interrogation finished! %s unique results, %s total.'
                  % (thetime, format(len(out.results.columns), ','),
                     format(out.totals.sum(), ',')))
        else:
            print('%s: Interrogation finished! %s matches.' % (thetime, format(tt, ',')))
    if save:
        out.save(save, print_info=print_info)

    if list(out.results.index) == ['0'] and not kwargs.get('df1_always_df'):
        out.results = out.results.iloc[0].sort_index()
    return out
def pmultiquery(corpus, search, show='words', query='any', sort_by='total', save=False, multiprocess='default', just_speakers=False, root=False, note=False, print_info=True, **kwargs ): """ - Parallel process multiple queries or corpora. - This function is used by corpkit.interrogator.interrogator() - for multiprocessing. - There's no reason to call this function yourself.""" import os from pandas import DataFrame, Series import pandas as pd import collections from collections import namedtuple, OrderedDict from time import strftime, localtime import corpkit from corpkit.interrogator import interrogator from corpkit.interrogation import Interrogation try: from joblib import Parallel, delayed except ImportError: pass import multiprocessing locs = locals() for k, v in kwargs.items(): locs[k] = v in_notebook = locs.get('in_notebook') def best_num_parallel(num_cores, num_queries): """decide how many parallel processes to run the idea, more or less, is to balance the load when possible""" import corpkit if num_queries <= num_cores: return num_queries if num_queries > num_cores: if (num_queries / num_cores) == num_cores: return int(num_cores) if num_queries % num_cores == 0: try: return max([int(num_queries / n) for n in range(2, num_cores) \ if int(num_queries / n) <= num_cores]) except ValueError: return num_cores else: import math if (float(math.sqrt(num_queries))).is_integer(): square_root = math.sqrt(num_queries) if square_root <= num_queries / num_cores: return int(square_root) return num_cores num_cores = multiprocessing.cpu_count() # what is our iterable? ... multiple_option = False multiple_queries = False multiple_speakers = False multiple_corpora = False multiple_search = False mult_corp_are_subs = False denom = 1 if hasattr(corpus, '__iter__'): multiple_corpora = True num_cores = best_num_parallel(num_cores, len(corpus)) denom = len(corpus) if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus): mult_corp_are_subs = True elif (isinstance(query, (list, dict)) and not hasattr(search, '__iter__')): multiple_queries = True num_cores = best_num_parallel(num_cores, len(query)) denom = len(query) elif hasattr(search, '__iter__') and all(isinstance(i, dict) for i in list(search.values())): multiple_search = True num_cores = best_num_parallel(num_cores, len(list(search.keys()))) denom = len(list(search.keys())) elif just_speakers: from build import get_speaker_names_from_xml_corpus multiple_speakers = True if just_speakers == 'each' or just_speakers == ['each']: just_speakers = get_speaker_names_from_xml_corpus(corpus.path) if len(just_speakers) == 0: print('No speaker name data found.') return num_cores = best_num_parallel(num_cores, len(just_speakers)) denom = len(just_speakers) if multiple_corpora and any(x is True for x in [multiple_speakers, multiple_queries, multiple_search, multiple_option]): from corpkit.corpus import Corpus, Corpora if isinstance(corpus, Corpora): multiprocess = False else: corpus = Corpus(corpus) if isinstance(multiprocess, int): num_cores = multiprocess if multiprocess is False: num_cores = 1 # make sure saves are right type if save is True: raise ValueError('save must be string when multiprocessing.') # the options that don't change d = {'function': 'interrogator', 'root': root, 'note': note, 'denominator': denom} # add kwargs to query for k, v in list(kwargs.items()): d[k] = v # make a list of dicts to pass to interrogator, # with the iterable unique in every one ds = [] if multiple_corpora: for index, p in enumerate(corpus): name = p.name a_dict = 
dict(d) a_dict['corpus'] = p a_dict['search'] = search a_dict['query'] = query a_dict['show'] = show a_dict['outname'] = name.replace('-parsed', '') a_dict['just_speakers'] = just_speakers a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) elif multiple_queries: for index, (name, q) in enumerate(query.items()): a_dict = dict(d) a_dict['corpus'] = corpus a_dict['search'] = search a_dict['query'] = q a_dict['show'] = show a_dict['outname'] = name a_dict['just_speakers'] = just_speakers a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) elif multiple_speakers: for index, name in enumerate(just_speakers): a_dict = dict(d) a_dict['corpus'] = corpus a_dict['search'] = search a_dict['query'] = query a_dict['show'] = show a_dict['outname'] = name a_dict['just_speakers'] = [name] a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) elif multiple_search: for index, (name, val) in enumerate(search.items()): a_dict = dict(d) a_dict['corpus'] = corpus a_dict['search'] = val a_dict['query'] = query a_dict['show'] = show a_dict['outname'] = name a_dict['just_speakers'] = just_speakers a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) if kwargs.get('conc') is False: message = 'Interrogating' elif kwargs.get('conc') is True: message = 'Interrogating and concordancing' elif kwargs.get('conc').lower() == 'only': message = 'Concordancing' time = strftime("%H:%M:%S", localtime()) sformat = '' if multiple_queries: to_it_over = query else: to_it_over = search for i, (k, v) in enumerate(list(to_it_over.items())): if isinstance(v, list): vformat = ', '.join(v[:5]) if len(v) > 5: vformat += ' ...' elif isinstance(v, dict): vformat = '' for kk, vv in v.items(): if isinstance(vv, list): vv = ', '.join(vv[:5]) vformat += '\n %s: %s' % (kk, vv) if len(vv) > 5: vformat += ' ...' else: try: vformat = v.pattern except AttributeError: vformat = v sformat += '%s: %s' %(k, vformat) if i < len(to_it_over.keys()) - 1: sformat += '\n ' if print_info: # proper printing for plurals # in truth this needs to be revised, it's horrible. if num_cores == 1: add_es = '' else: add_es = 'es' if multiple_corpora and not multiple_option: corplist = "\n ".join([i.name for i in corpus[:20]]) if len(corpus) > 20: corplist += '\n ... and %d more ...\n' % (len(corpus) - 20) print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n %s" \ "\n Query: %s\n %s corpus ... \n" % (time, len(corpus), num_cores, add_es, corplist, sformat, message))) elif multiple_queries: print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \ "\n Queries: %s\n %s corpus ... \n" % (time, len(query), num_cores, add_es, corpus.name, sformat, message) )) elif multiple_search: print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \ "\n Queries: %s\n %s corpus ... \n" % (time, len(list(search.keys())), num_cores, add_es, corpus.name, sformat, message))) elif multiple_option: print(("\n%s: Beginning %d parallel corpus interrogation%s (multiple options): %s" \ "\n Query: %s\n %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message) )) elif multiple_speakers: print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \ "\n Query: %s\n %s corpus ... 
\n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message) )) # run in parallel, get either a list of tuples (non-c option) # or a dataframe (c option) #import sys #reload(sys) #stdout=sys.stdout failed = False terminal = False used_joblib = False #ds = ds[::-1] if not root and print_info: from blessings import Terminal terminal = Terminal() print('\n' * (len(ds) - 2)) for dobj in ds: linenum = dobj['paralleling'] # this try handles nosetest problems in sublime text try: with terminal.location(0, terminal.height - (linenum + 1)): # this is a really bad idea. thetime = strftime("%H:%M:%S", localtime()) num_spaces = 26 - len(dobj['outname']) print('%s: QUEUED: %s' % (thetime, dobj['outname'])) except: pass if not root and multiprocess: #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds) try: #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True) res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds) used_joblib = True except: failed = True print('Multiprocessing failed.') raise if not res: failed = True else: res = [] for index, d in enumerate(ds): d['startnum'] = (100 / denom) * index res.append(interrogator(**d)) try: res = sorted([i for i in res if i]) except: pass # remove unpicklable bits from query from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType) qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)} if hasattr(qlocs['corpus'], 'name'): qlocs['corpus'] = qlocs['corpus'].path else: qlocs['corpus'] = list([i.path for i in qlocs['corpus']]) from corpkit.interrogation import Concordance if kwargs.get('conc') == 'only': concs = pd.concat([x for x in res]) thetime = strftime("%H:%M:%S", localtime()) concs = concs.reset_index(drop=True) lines = Concordance(concs) if save: lines.save(save, print_info=print_info) if print_info: print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index))) return lines if not all(isinstance(i.results, Series) for i in res): out = OrderedDict() for interrog, d in zip(res, ds): for unpicklable in ['note', 'root']: interrog.query.pop(unpicklable, None) try: out[interrog.query['outname']] = interrog except KeyError: out[d['outname']] = interrog from corpkit.interrogation import Interrodict idict = Interrodict(out) if print_info: time = strftime("%H:%M:%S", localtime()) print("\n\n%s: Finished! 
Output is a dictionary with keys:\n\n '%s'\n" % \ (time, "'\n '".join(sorted(out.keys())))) idict.query = qlocs if save: idict.save(save, print_info=print_info) return idict # make query and total branch, save, return # todo: standardise this so we don't have to guess transposes else: if multiple_corpora and not mult_corp_are_subs: sers = [i.results for i in res] out = DataFrame(sers, index=[i.query['outname'] for i in res]) out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols out = out.fillna(0) # nan to zero out = out.astype(int) # float to int out = out.T else: try: out = pd.concat([r.results for r in res], axis=1) out = out.T out.index = [i.query['outname'] for i in res] except ValueError: return None # format like normal # this sorts subcorpora, which are cls out = out[sorted(list(out.columns))] # puts subcorpora in the right place if not mult_corp_are_subs: out = out.T out = out.fillna(0) # nan to zero out = out.astype(int) if 'c' in show and mult_corp_are_subs: out = out.sum() out.index = sorted(list(out.index)) # sort by total if isinstance(out, DataFrame): out = out[list(out.sum().sort_values(ascending=False).index)] # really need to figure out the deal with tranposing! if all(x.endswith('.xml') for x in list(out.columns)) \ or all(x.endswith('.txt') for x in list(out.columns)): out = out.T out = out.edit(sort_by=sort_by, print_info=False, keep_stats=False, \ df1_always_df=kwargs.get('df1_always_df')) out.query = qlocs if len(out.results.columns) == 1: out.results = out.results.sort_index() if kwargs.get('conc') is True: concs = pd.concat([x.concordance for x in res], ignore_index=True) concs = concs.sort_values(by='c') concs = concs.reset_index(drop=True) out.concordance = Concordance(concs) thetime = strftime("%H:%M:%S", localtime()) if terminal and print_info: with terminal.location(0, terminal.height): print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n')) else: if print_info: print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n')) if save: out.save(save, print_info = print_info) return out
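# A minimal sketch of the fan-out pattern used above, assuming joblib is
# installed: build one kwargs dict per job, tag each with a 'paralleling'
# index, and map a worker over the dicts with Parallel. The worker and the
# queries here are hypothetical stand-ins, not corpkit's interrogator().
def _demo_parallel_fanout():
    from joblib import Parallel, delayed

    def _worker(outname, paralleling, query):
        # a real worker would run the interrogation described by its kwargs
        return paralleling, outname, len(query)

    queries = {'nouns': 'NN.*', 'verbs': 'VB.*'}
    ds = [{'outname': name, 'paralleling': i, 'query': q}
          for i, (name, q) in enumerate(queries.items())]
    res = Parallel(n_jobs=2)(delayed(_worker)(**d) for d in ds)
    # joblib returns results in submission order, so the 'paralleling'
    # index can be used to label or re-sort them afterwards
    return sorted(res)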
def compare_survival(y, group_indicator, return_stats=False): """K-sample log-rank hypothesis test of identical survival functions. Compares the pooled hazard rate with each group-specific hazard rate. The alternative hypothesis is that the hazard rate of at least one group differs from the others at some time. See [1]_ for more details. Parameters ---------- y : structured array, shape = (n_samples,) A structured array containing the binary event indicator as first field, and time of event or time of censoring as second field. group_indicator : array-like, shape = (n_samples,) Group membership of each sample. return_stats : bool, optional, default: False Whether to return a data frame with statistics for each group and the covariance matrix of the test statistic. Returns ------- chisq : float Test statistic. pvalue : float Two-sided p-value with respect to the null hypothesis that the hazard rates across all groups are equal. stats : pandas.DataFrame Summary statistics for each group: number of samples, observed number of events, expected number of events, and test statistic. Only provided if `return_stats` is True. covariance : array, shape=(n_groups, n_groups) Covariance matrix of the test statistic. Only provided if `return_stats` is True. References ---------- .. [1] Fleming, T. R. and Harrington, D. P. A Class of Hypothesis Tests for One and Two Samples of Censored Survival Data. Communications In Statistics 10 (1981): 763-794. """ group_indicator, event, time = check_arrays_survival(group_indicator, y, dtype="O", ensure_2d=False) n_samples = time.shape[0] groups, group_counts = numpy.unique(group_indicator, return_counts=True) n_groups = groups.shape[0] if n_groups == 1: raise ValueError("At least two groups must be specified, " "but only one was provided.") # sort descending o = numpy.argsort(-time, kind="mergesort") x = group_indicator[o] event = event[o] time = time[o] at_risk = numpy.zeros(n_groups, dtype=numpy.int_) observed = numpy.zeros(n_groups, dtype=numpy.int_) expected = numpy.zeros(n_groups, dtype=numpy.float_) covar = numpy.zeros((n_groups, n_groups), dtype=numpy.float_) k = 0 while k < n_samples: ti = time[k] total_events = 0 while k < n_samples and ti == time[k]: idx = numpy.searchsorted(groups, x[k]) if event[k]: observed[idx] += 1 total_events += 1 at_risk[idx] += 1 k += 1 if total_events != 0: total_at_risk = k expected += at_risk * (total_events / total_at_risk) if total_at_risk > 1: multiplier = total_events * (total_at_risk - total_events) / ( total_at_risk * (total_at_risk - 1)) for g1 in range(n_groups): temp = at_risk[g1] * multiplier covar[g1, g1] += temp for g2 in range(n_groups): covar[g1, g2] -= temp * at_risk[g2] / total_at_risk df = n_groups - 1 zz = observed[:df] - expected[:df] chisq = numpy.linalg.solve(covar[:df, :df], zz).dot(zz) pval = stats.chi2.sf(chisq, df) if return_stats: table = OrderedDict() table["counts"] = group_counts table["observed"] = observed table["expected"] = expected table["statistic"] = observed - expected table = pandas.DataFrame.from_dict(table) table.index = pandas.Index(groups, name="group") return chisq, pval, table, covar return chisq, pval
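# A small usage sketch for compare_survival(), on made-up data following the
# structured-array convention from the docstring (event indicator first,
# time second). Assumes the module-level imports (numpy, scipy's stats,
# check_arrays_survival) that the function itself relies on.
def _demo_compare_survival():
    y = numpy.empty(6, dtype=[('event', bool), ('time', float)])
    y['event'] = [True, True, False, True, False, True]
    y['time'] = [3.0, 5.0, 7.0, 2.0, 9.0, 4.0]
    groups = numpy.array(['a', 'a', 'a', 'b', 'b', 'b'])
    chisq, pvalue = compare_survival(y, groups)
    # return_stats=True additionally yields the per-group observed/expected
    # table and the covariance matrix of the test statistic
    return chisq, pvalue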
def rank(Input1, Input2, D, R):
    f = h5py.File('all_data', 'w')
    # 'Input' was undefined here; the row words come from the first argument
    content = [word.strip() for word in open(Input1)]
    test_content = [word.strip() for word in open(Input2)]
    k = 0
    ResultMatrix = R.todense()
    TruthMatrix = D.todense()
    ranking = []
    for i in content:
        l = 0
        d = {}
        e = {}
        rc = 0
        tr = 0
        step = 0
        for j in test_content:
            d[str(j)] = ResultMatrix[k, l]
            e[str(j)] = TruthMatrix[k, l]
            l += 1
        # candidate and truth words, ordered by descending score
        C = list(OrderedDict(sorted(d.items(), key=lambda t: np.float(t[1]), reverse=True)).keys())
        T = list(OrderedDict(sorted(e.items(), key=lambda t: np.float(t[1]), reverse=True)).keys())
        tar_words = []
        for m in range(0, 5):
            print '\t', T[m], '\t\t', C[m]
            tar_words.append(C[m])
            rc += C.index(T[m]) + 1
            step += 1
            tr += step
        # mean 1-based rank of the five truth words in the result ordering
        list_rank = rc / 5.0
        ranking.append(list_rank)
        print "\t_________________________________"
        print "\t", tr, "\t\t", rc, '\t', list_rank
        k += 1
        label_rank = i + '_rank'
        label_words = i + '_words'
        f.create_dataset(label_rank, data=list_rank)
        f.create_dataset(label_words, data=tar_words)
    # average over the words actually evaluated (was hard-coded as /337.0)
    avg_rank = float(sum(ranking)) / len(ranking)
    label_avg_rank = '_avg_rank'
    f.create_dataset(label_avg_rank, data=avg_rank)
    print "Average Rank", avg_rank
    f.close()
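# The evaluation above reduces to one metric: order candidates by predicted
# score, then average the 1-based positions of the top-n truth words within
# that ordering. A self-contained sketch of just the metric:
def _mean_rank_of_truth(pred_scores, truth_scores, topn=5):
    C = sorted(pred_scores, key=pred_scores.get, reverse=True)
    T = sorted(truth_scores, key=truth_scores.get, reverse=True)
    return sum(C.index(t) + 1 for t in T[:topn]) / float(topn)
# _mean_rank_of_truth({'a': 0.9, 'b': 0.5, 'c': 0.1},
#                     {'a': 0.2, 'b': 0.7, 'c': 0.1}, topn=2)  # (2 + 1) / 2.0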
if cat in dictchannels:
    if not cat.startswith("VOD"):
        for x in dictchannels[cat]:
            x['serviceRef'] = "{}:0:1:{:x}:0:0:0:0:0:0".format(x['streamType'], num)
            num += 1
    else:
        for x in dictchannels[cat]:
            x['serviceRef'] = "{}:0:1:{:x}:0:0:0:0:0:0".format(x['streamType'], vod_service_id)
while catstartnum < num:
    catstartnum += category_offset

# move all VOD categories to the VOD placeholder position
if "VOD" in category_order:
    vodindex = category_order.index("VOD")
    vodcategories = list(cat for cat in category_order if cat.startswith('VOD -'))
    if len(vodcategories):
        # remove the multi-VOD categories from their current location
        category_order = [x for x in category_order if x not in vodcategories]
        # insert the multi-VOD categories at the placeholder position
        category_order[vodindex:vodindex] = vodcategories
        category_order.remove("VOD")

# Check for and parse override map
self.parse_map_channels_xml(dictchannels)

# Have a look at what we have
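# The reordering above is a plain list splice: cut the VOD sub-categories
# out of the order, then re-insert them as a block where the "VOD"
# placeholder sat. A standalone sketch with hypothetical category names:
def _demo_vod_reorder():
    order = ['News', 'VOD', 'Sport', 'VOD - Movies', 'VOD - Series']
    idx = order.index('VOD')
    vod = [c for c in order if c.startswith('VOD -')]
    order = [c for c in order if c not in vod]
    order[idx:idx] = vod
    order.remove('VOD')
    return order  # ['News', 'VOD - Movies', 'VOD - Series', 'Sport']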
def pmultiquery(corpus, search, show='words', query='any', sort_by='total', quicksave=False, multiprocess='default', just_speakers=False, root=False, note=False, print_info=True, **kwargs): """Parallel process multiple queries or corpora. This function is used by interrogator() for multiprocessing. There's no reason to call this function yourself.""" import collections import os import pandas as pd import collections from collections import namedtuple from time import strftime, localtime import corpkit from interrogator import interrogator from editor import editor from other import save from interrogation import Interrogation try: from joblib import Parallel, delayed except: pass #raise ValueError('joblib, the module used for multiprocessing, cannot be found. ' \ # 'Install with:\n\n pip install joblib') import multiprocessing locs = locals() for k, v in kwargs.items(): locs[k] = v def best_num_parallel(num_cores, num_queries): import corpkit """decide how many parallel processes to run the idea, more or less, is to balance the load when possible""" if num_queries <= num_cores: return num_queries if num_queries > num_cores: if (num_queries / num_cores) == num_cores: return int(num_cores) if num_queries % num_cores == 0: try: return max([ int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores ]) except ValueError: return num_cores else: import math if (float(math.sqrt(num_queries))).is_integer(): square_root = math.sqrt(num_queries) if square_root <= num_queries / num_cores: return int(square_root) return num_cores num_cores = multiprocessing.cpu_count() # what is our iterable? ... multiple_option = False multiple_queries = False multiple_speakers = False multiple_corpora = False multiple_search = False mult_corp_are_subs = False denom = 1 if hasattr(corpus, '__iter__'): multiple_corpora = True num_cores = best_num_parallel(num_cores, len(corpus)) denom = len(corpus) if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus): mult_corp_are_subs = True elif (type(query) == list or type(query) == dict) and not hasattr(search, '__iter__'): multiple_queries = True num_cores = best_num_parallel(num_cores, len(query)) denom = len(query) elif hasattr(search, '__iter__') and all( type(i) == dict for i in list(search.values())): multiple_search = True num_cores = best_num_parallel(num_cores, len(list(search.keys()))) denom = len(list(search.keys())) elif just_speakers: from build import get_speaker_names_from_xml_corpus multiple_speakers = True if just_speakers == 'each' or just_speakers == ['each']: just_speakers = get_speaker_names_from_xml_corpus(corpus.path) if len(just_speakers) == 0: print('No speaker name data found.') return num_cores = best_num_parallel(num_cores, len(just_speakers)) denom = len(just_speakers) # if this thing has already come through multiquery, don't multiprocess this time #if kwargs.get('outname'): # multiprocess = False if multiple_corpora and any(x is True for x in [ multiple_speakers, multiple_queries, multiple_search, multiple_option ]): from corpus import Corpus, Corpora if corpus.__class__ == Corpora: multiprocess = False else: corpus = Corpus(corpus) if type(multiprocess) == int: num_cores = multiprocess if multiprocess is False: num_cores = 1 # make sure quicksaves are right type if quicksave is True: raise ValueError('quicksave must be string when using pmultiquery.') # the options that don't change d = { #'paralleling': True, 'function': 'interrogator', 'root': root, 'note': note, 'denominator': denom } # add kwargs to query 
for k, v in list(kwargs.items()): d[k] = v # make a list of dicts to pass to interrogator, # with the iterable unique in every one ds = [] if multiple_corpora: for index, p in enumerate(corpus): name = p.name a_dict = dict(d) a_dict['corpus'] = p a_dict['search'] = search a_dict['query'] = query a_dict['show'] = show a_dict['outname'] = name.replace('-parsed', '') a_dict['just_speakers'] = just_speakers a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) elif multiple_queries: for index, (name, q) in enumerate(query.items()): a_dict = dict(d) a_dict['corpus'] = corpus a_dict['search'] = search a_dict['query'] = q a_dict['show'] = show a_dict['outname'] = name a_dict['just_speakers'] = just_speakers a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) elif multiple_speakers: for index, name in enumerate(just_speakers): a_dict = dict(d) a_dict['corpus'] = corpus a_dict['search'] = search a_dict['query'] = query a_dict['show'] = show a_dict['outname'] = name a_dict['just_speakers'] = [name] a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) elif multiple_search: for index, (name, val) in enumerate(search.items()): a_dict = dict(d) a_dict['corpus'] = corpus a_dict['search'] = val a_dict['query'] = query a_dict['show'] = show a_dict['outname'] = name a_dict['just_speakers'] = just_speakers a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) if kwargs.get('do_concordancing') is False: message = 'Interrogating' elif kwargs.get('do_concordancing') is True: message = 'Interrogating and concordancing' elif kwargs.get('do_concordancing').lower() == 'only': message = 'Concordancing' time = strftime("%H:%M:%S", localtime()) sformat = '' if multiple_queries: to_it_over = query else: to_it_over = search for i, (k, v) in enumerate(list(to_it_over.items())): if type(v) == list: vformat = ', '.join(v[:5]) if len(v) > 5: vformat += ' ...' elif type(v) == dict: vformat = '' for kk, vv in v.items(): if type(vv) == list: vv = ', '.join(vv[:5]) vformat += '\n %s: %s' % (kk, vv) if len(vv) > 5: vformat += ' ...' else: vformat = v sformat += '%s: %s' % (k, vformat) if i < len(to_it_over.keys()) - 1: sformat += '\n ' if print_info: if multiple_corpora and not multiple_option: corplist = "\n ".join([i.name for i in corpus[:20]]) if len(corpus) > 20: corplist += '\n ... and %d more ...\n' % (len(corpus) - 20) print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n %s" \ "\n Query: %s\n %s corpus ... \n" % (time, len(corpus), num_cores, corplist, sformat, message))) elif multiple_queries: print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \ "\n Queries: %s\n %s corpus ... \n" % (time, len(query), num_cores, corpus.name, sformat, message) )) elif multiple_search: print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \ "\n Queries: %s\n %s corpus ... \n" % (time, len(list(search.keys())), num_cores, corpus.name, sformat, message))) elif multiple_option: print(("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \ "\n Query: %s\n %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) )) elif multiple_speakers: print(("\n%s: Beginning %d parallel corpus interrogations: %s" \ "\n Query: %s\n %s corpus ... 
\n" % (time, num_cores, corpus.name, sformat, message) )) # run in parallel, get either a list of tuples (non-c option) # or a dataframe (c option) #import sys #reload(sys) #stdout=sys.stdout failed = False terminal = False used_joblib = False #ds = ds[::-1] if not root and print_info: from blessings import Terminal terminal = Terminal() print('\n' * (len(ds) - 2)) for dobj in ds: linenum = dobj['paralleling'] # this try handles nosetest problems in sublime text try: with terminal.location(0, terminal.height - (linenum + 1)): # this is a really bad idea. thetime = strftime("%H:%M:%S", localtime()) num_spaces = 26 - len(dobj['outname']) print('%s: QUEUED: %s' % (thetime, dobj['outname'])) except: pass if not root and multiprocess: #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds) try: #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True) res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds) used_joblib = True except: failed = True print('Multiprocessing failed.') raise if not res: failed = True else: res = [] for index, d in enumerate(ds): d['startnum'] = (100 / denom) * index res.append(interrogator(**d)) try: res = sorted(res) except: pass # multiprocessing way #from multiprocessing import Process #from interrogator import interrogator #jobs = [] ##for d in ds: ## p = multiprocessing.Process(target=interrogator, kwargs=(**d,)) ## jobs.append(p) ## p.start() ## while p.is_alive(): ## import time ## time.sleep(2) ## if root: ## root.update() #result_queue = multiprocessing.Queue() # #for d in ds: #funs = [interrogator(result_queue, **kwargs) for kwargs in ds] #jobs = [multiprocessing.Process(mc) for mc in funs] #for job in jobs: job.start() #for job in jobs: job.join() #results = [result_queue.get() for mc in funs] import corpkit from interrogation import Concordance if kwargs.get('do_concordancing') == 'only': concs = pd.concat([x for x in res]) thetime = strftime("%H:%M:%S", localtime()) if print_info: print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index))) return Concordance(concs) from collections import OrderedDict if not all(type(i.results) == pd.core.series.Series for i in res): out = OrderedDict() for interrog, d in zip(res, ds): for unpicklable in ['note', 'root']: interrog.query.pop(unpicklable, None) try: out[interrog.query['outname']] = interrog except KeyError: out[d['outname']] = interrog if quicksave: fullpath = os.path.join('saved_interrogations', quicksave) while os.path.isdir(fullpath): selection = input( "\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: " % (quicksave, 'saved_interrogations')) if selection == 'o' or selection == 'O': import shutil shutil.rmtree(fullpath) else: import os fullpath = os.path.join('saved_interrogations', selection) for k, v in list(out.items()): save(v, k, savedir=fullpath, print_info=False) time = strftime("%H:%M:%S", localtime()) print("\n%s: %d files saved to %s" % (time, len(list(out.keys())), fullpath)) time = strftime("%H:%M:%S", localtime()) if print_info: print( "\n\n%s: Finished! 
Output is a dictionary with keys:\n\n '%s'\n" % (time, "'\n '".join(sorted(out.keys())))) from interrogation import Interrodict idict = Interrodict(out) # remove unpicklable bits from query from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType locs = {k: v for k, v in locs.items() if not isinstance(v, ModuleType) \ and not isinstance(v, FunctionType) \ and not isinstance(v, BuiltinFunctionType) \ and not isinstance(v, BuiltinMethodType)} idict.query = locs return idict # make query and total branch, save, return else: #print sers #print ds if multiple_corpora and not mult_corp_are_subs: sers = [i.results for i in res] out = pd.DataFrame(sers, index=[i.query['outname'] for i in res]) out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols out = out.fillna(0) # nan to zero out = out.astype(int) # float to int out = out.T else: try: out = pd.concat([r.results for r in res], axis=1) except ValueError: return None # format like normal out = out[sorted(list(out.columns))] out = out.T out = out.fillna(0) # nan to zero out = out.astype(int) if 'c' in show and mult_corp_are_subs: out = out.sum() out.index = sorted(list(out.index)) # sort by total if type(out) == pd.core.frame.DataFrame: out.ix['Total-tmp'] = out.sum() tot = out.ix['Total-tmp'] out = out[tot.argsort()[::-1]] out = out.drop('Total-tmp', axis=0) out = out.edit(sort_by = sort_by, print_info = False, keep_stats = False, \ df1_always_df = kwargs.get('df1_always_df')) if len(out.results.columns) == 1: out.results = out.results.sort_index() if kwargs.get('do_concordancing') is True: concs = pd.concat([x.concordance for x in res], ignore_index=True) concs = concs.sort_values(by='c') concs = concs.reset_index(drop=True) out.concordance = Concordance(concs) thetime = strftime("%H:%M:%S", localtime()) if terminal and print_info: with terminal.location(0, terminal.height): print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len( out.results.columns), out.totals.sum(), '\n')) else: if print_info: print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len( out.results.columns), out.totals.sum(), '\n')) #if used_joblib: if quicksave: from other import save save(out, quicksave) return out
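# A usage sketch for pmultiquery(), assuming a parsed corpkit corpus at a
# hypothetical path. A dict of named search dicts takes the multiple_search
# branch above, running one parallel interrogation per name:
#
#     corpus = corpkit.Corpus('data/speeches-parsed')  # hypothetical path
#     res = pmultiquery(corpus,
#                       search={'nouns': {'t': r'/NN.?/ < __'},
#                               'verbs': {'t': r'/VB.?/ < __'}},
#                       show='words',
#                       multiprocess=2,
#                       print_info=False)
#
# When the per-name results are not all Series, res is an Interrodict keyed
# by the names above; otherwise it is a single edited interrogation.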
def assign_variables(assignment_expressions, df, locals_dict, df_alias=None, trace_rows=None): """ Evaluate a set of variable expressions from a spec in the context of a given data table. Expressions are evaluated using Python's eval function. Python expressions have access to variables in locals_d (and df being accessible as variable df.) They also have access to previously assigned targets as the assigned target name. lowercase variables starting with underscore are temp variables (e.g. _local_var) and not returned except in trace_results uppercase variables starting with underscore are temp scalar variables (e.g. _LOCAL_SCALAR) and not returned except in trace_assigned_locals This is useful for defining general purpose local constants in expression file Users should take care that expressions (other than temp scalar variables) should result in a Pandas Series (scalars will be automatically promoted to series.) Parameters ---------- assignment_expressions : pandas.DataFrame of target assignment expressions target: target column names expression: pandas or python expression to evaluate df : pandas.DataFrame locals_d : Dict This is a dictionary of local variables that will be the environment for an evaluation of "python" expression. trace_rows: series or array of bools to use as mask to select target rows to trace Returns ------- variables : pandas.DataFrame Will have the index of `df` and columns named by target and containing the result of evaluating expression trace_df : pandas.DataFrame or None a dataframe containing the eval result values for each assignment expression """ np_logger = NumpyLogger(logger) def is_throwaway(target): return target == '_' def is_temp_scalar(target): return target.startswith('_') and target.isupper() def is_temp(target): return target.startswith('_') def to_series(x): if x is None or np.isscalar(x): return pd.Series([x] * len(df.index), index=df.index) return x assert assignment_expressions.shape[0] > 0 trace_assigned_locals = trace_results = None if trace_rows is not None: # convert to numpy array so we can slice ndarrays as well as series trace_rows = np.asanyarray(trace_rows) if trace_rows.any(): trace_results = OrderedDict() trace_assigned_locals = OrderedDict() # avoid touching caller's passed-in locals_d parameter (they may be looping) _locals_dict = local_utilities() if locals_dict is not None: _locals_dict.update(locals_dict) if df_alias: _locals_dict[df_alias] = df else: _locals_dict['df'] = df local_keys = list(_locals_dict.keys()) # build a dataframe of eval results for non-temp targets # since we allow targets to be recycled, we want to only keep the last usage variables = OrderedDict() # need to be able to identify which variables causes an error, which keeps # this from being expressed more parsimoniously for e in zip(assignment_expressions.target, assignment_expressions.expression): target, expression = e assert isinstance(target, str), \ "expected target '%s' for expression '%s' to be string not %s" % \ (target, expression, type(target)) if target in local_keys: logger.warning( "assign_variables target obscures local_d name '%s'", str(target)) if is_temp_scalar(target) or is_throwaway(target): try: x = eval(expression, globals(), _locals_dict) except Exception as err: logger.error("assign_variables error: %s: %s", type(err).__name__, str(err)) logger.error("assign_variables expression: %s = %s", str(target), str(expression)) raise err if not is_throwaway(target): _locals_dict[target] = x if trace_assigned_locals is not None: 
trace_assigned_locals[uniquify_key(trace_assigned_locals, target)] = x continue try: # FIXME - log any numpy warnings/errors but don't raise np_logger.target = str(target) np_logger.expression = str(expression) saved_handler = np.seterrcall(np_logger) save_err = np.seterr(all='log') # FIXME should whitelist globals for security? globals_dict = {} expr_values = to_series( eval(expression, globals_dict, _locals_dict)) np.seterr(**save_err) np.seterrcall(saved_handler) except Exception as err: logger.error("assign_variables error: %s: %s", type(err).__name__, str(err)) logger.error("assign_variables expression: %s = %s", str(target), str(expression)) raise err if not is_temp(target): variables[target] = expr_values if trace_results is not None: trace_results[uniquify_key(trace_results, target)] = expr_values[trace_rows] # update locals to allows us to ref previously assigned targets _locals_dict[target] = expr_values if trace_results is not None: trace_results = pd.DataFrame.from_dict(trace_results) trace_results.index = df[trace_rows].index # add df columns to trace_results trace_results = pd.concat([df[trace_rows], trace_results], axis=1) # we stored result in dict - convert to df variables = util.df_from_dict(variables, index=df.index) return variables, trace_results, trace_assigned_locals
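# A usage sketch for assign_variables(), with a made-up spec and dataframe.
# Targets are evaluated in order and may reference earlier targets; names
# like _SCALE (underscore plus uppercase) are temp scalars and names like
# _local_var are temp series, neither of which appears in the returned frame.
def _demo_assign_variables():
    df = pd.DataFrame({'dist': [1.0, 4.0, 9.0]})
    spec = pd.DataFrame({
        'target': ['_SCALE', 'scaled', 'double'],
        'expression': ['2.0', 'df.dist * _SCALE', 'scaled * 2'],
    })
    variables, trace_df, trace_locals = assign_variables(spec, df, {})
    return variables  # columns 'scaled' and 'double'; '_SCALE' is dropped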
class Encoder:
    def __init__(self):
        self.keysets = []
        self.keysetsindex = 0
        self.stringhist = {}
        self.output = []

    def createstringhist(self, obj):
        if type(obj) == str or type(obj) == unicode:
            obj = obj.encode('utf-8')
            self.stringhist[obj] = self.stringhist.get(obj, 0) + 1
            return
        if type(obj) == list:
            for p in obj:
                self.createstringhist(p)
        elif type(obj) == dict:
            for p in obj:
                self.createstringhist(obj[p])

    def encode(self, obj, keysetstoomit=None):
        if keysetstoomit is None:
            keysetstoomit = []
        self.keysets = [tuple(p) for p in keysetstoomit]
        self.keysetsindex = len(self.keysets)
        self.output = []
        self.stringhist = OrderedDict()
        self.createstringhist(obj)
        # score each string by the bytes saved if it is interned in the lookup table
        self.stringhist = [(p, ((len(p) + 1) * self.stringhist[p]) - (len(p) + 2 + self.stringhist[p]))
                           for p in self.stringhist]
        self.stringhist = [(-q, p) for (p, q) in self.stringhist if q > 0]
        self.stringhist = [p for (q, p) in sorted(self.stringhist)]
        # truncate before snapshotting, so the emitted table length matches
        # the refs actually written (the snapshot previously kept the full list)
        if len(self.stringhist) > 255:
            self.stringhist = self.stringhist[:255]
        q = self.stringhist
        self.write(obj)
        x = self.output
        self.output = []
        self.stringhist = []
        self.output.append(TypeTags.STRLUT)
        self.output.append(len(q))
        for p in q:
            self.writeStr(p)
        self.writeArray(self.keysets[self.keysetsindex:])
        return self.output + x

    def write(self, x):
        if type(x) == type(None):
            self.output.append(TypeTags.NULL)
        elif x == NULL:
            self.output.append(TypeTags.NULL)
        elif x == UNDEFINED:
            self.output.append(TypeTags.UNDEFINED)
        elif x == DEFAULT:
            self.output += [TypeTags.EXTENSION, 0, TypeTags.UNDEFINED]
        elif x == INFINITY:
            self.output += [TypeTags.FLOAT32, 0x7F, 0x80, 0x00, 0x00]
        elif x == NEGINFINITY:
            self.output += [TypeTags.FLOAT32, 0xFF, 0x80, 0x00, 0x00]
        elif x == NAN:
            self.output += [TypeTags.FLOAT32, 0x7F, 0xC0, 0x00, 0x00]
        else:
            ENCODERS[type(x)](self, x)

    def writeBoolean(self, x):
        if x:
            self.output.append(TypeTags.TRUE)
        else:
            self.output.append(TypeTags.FALSE)

    def writeInt(self, x):
        if abs(x) > 0xFFFFFFFFFFFFFFFF:
            return self.writeFloat(x)
        if x >= 0:
            if x < 64:
                self.output.append(x)
            elif x <= 0x3FFF:
                self.output += [TypeTags.UINT14_BASE | x >> 8, x & 0xFF]
            elif x <= 0xFFFF:
                self.output += [TypeTags.UINT16, x >> 8, x & 0xFF]
            elif x <= 0xFFFFFF:
                self.output += [TypeTags.UINT24, x >> 16, x >> 8 & 0xFF, x & 0xFF]
            elif x <= 0xFFFFFFFF:
                self.output.append(TypeTags.UINT32)
                self.output += int32tobytes(x)
            else:
                self.output.append(TypeTags.UINT64)
                self.output += int32tobytes(x >> 32 & 0xFFFFFFFF)
                self.output += int32tobytes(x & 0xFFFFFFFF)
        else:
            x = -x
            if x <= 15:
                # was self.output.push(...), which is JavaScript; Python lists use append
                self.output.append(TypeTags.NINT4_BASE | x)
            elif x <= 0xFF:
                self.output += [TypeTags.NINT8, x]
            elif x <= 0xFFFF:
                self.output += [TypeTags.NINT16, x >> 8, x & 0xFF]
            elif x <= 0xFFFFFFFF:
                self.output.append(TypeTags.NINT32)
                self.output += int32tobytes(x)
            else:
                self.output.append(TypeTags.NINT64)
                self.output += int32tobytes(x >> 32 & 0xFFFFFFFF)
                self.output += int32tobytes(x & 0xFFFFFFFF)

    def writeFloat(self, x):
        m = struct.pack("f", x)
        if x == struct.unpack("f", m)[0]:
            self.output.append(TypeTags.FLOAT32)
        else:
            self.output.append(TypeTags.DOUBLE64)
            m = struct.pack("d", x)
        self.output += [ord(m[i]) for i in range(len(m) - 1, -1, -1)]

    def writeStr(self, x):
        x = x.encode('utf-8')
        if x in self.stringhist:
            self.output += [TypeTags.STRREF, self.stringhist.index(x)]
            return
        z = False
        xe = []
        for p in x:
            p = ord(p)
            if p == 0:
                z = True
            xe.append(p)
        x = xe
        y = len(x)
        if y < 32:
            self.output.append(TypeTags.STR5_BASE | y)
            self.output += x
        else:
            if not z:
                self.output.append(TypeTags.CSTRING)
                self.output += x
                self.output.append(0)
            else:
                if y <= 255:
                    self.output += [TypeTags.STR8, y]
                else:
                    self.output.append(TypeTags.STR_)
                    self.writeInt(y)
                self.output += x

    def writeArray(self, x):
        isboolarray = True
        for p in x:
            if type(p) != bool:  # was type(x), which compared the list itself
                isboolarray = False
                break
        if isboolarray:
            if len(x) < 16:
                self.output.append(TypeTags.BARRAY4_BASE | len(x))
            elif len(x) < 256:
                self.output.append(TypeTags.BARRAY8)
                self.output.append(len(x))
            else:
                self.output.append(TypeTags.BARRAY_)
                self.writeInt(len(x))
            self.output += bytefrombools(x)
        else:
            if len(x) < 32:
                self.output.append(TypeTags.ARRAY5_BASE | len(x))
            elif len(x) <= 255:
                self.output.append(TypeTags.ARRAY8)
                self.output.append(len(x))
            else:
                self.output.append(TypeTags.ARRAY_)
                self.writeInt(len(x))
            for p in x:
                self.write(p)

    def writeDict(self, x):
        y = tuple(sorted(x.keys()))
        try:
            k = self.keysets.index(y)
        except ValueError:
            self.keysets.append(y)
            k = len(self.keysets) - 1
        isbool = True
        for p in y:
            if type(x[p]) != bool:
                isbool = False
                break
        if isbool:
            self.output.append(TypeTags.BMAP_)
            self.writeInt(k)
            self.output += bytefrombools([x[p] for p in y])
        else:
            self.output.append(TypeTags.MAP_)
            self.writeInt(k)
            for p in y:
                self.write(x[p])
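# The integer encoding above packs small values into tagged byte runs. A
# standalone sketch of the two-byte UINT14 case, using a hypothetical base
# tag value (the real one lives in TypeTags, not shown here):
def _demo_uint14_layout(x, uint14_base=0x40):  # 0x40 is a made-up placeholder
    assert 64 <= x <= 0x3FFF
    # the high 6 bits are OR-ed into the tag byte, the low 8 bits follow
    return [uint14_base | (x >> 8), x & 0xFF]
# _demo_uint14_layout(1000)  # [0x43, 0xE8]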
def eval_interaction_utilities(spec, df, locals_d, trace_label, trace_rows, estimator=None): """ Compute the utilities for a single-alternative spec evaluated in the context of df We could compute the utilities for interaction datasets just as we do for simple_simulate specs with multiple alternative columns byt calling eval_variables and then computing the utilities by matrix-multiplication of eval results with the utility coefficients in the spec alternative columns. But interaction simulate computes the utilities of each alternative in the context of a separate row in interaction dataset df, and so there is only one alternative in spec. This turns out to be quite a bit faster (in this special case) than the pandas dot function. For efficiency, we combine eval_variables and multiplication of coefficients into a single step, so we don't have to create a separate column for each partial utility. Instead, we simply multiply the eval result by a single alternative coefficient and sum the partial utilities. spec : dataframe one row per spec expression and one col with utility coefficient df : dataframe cross join (cartesian product) of choosers with alternatives combines columns of choosers and alternatives len(df) == len(choosers) * len(alternatives) index values (non-unique) are index values from alternatives df interaction_utilities : dataframe the utility of each alternative is sum of the partial utilities determined by the various spec expressions and their corresponding coefficients yielding a dataframe with len(interaction_df) rows and one utility column having the same index as interaction_df (non-unique values from alternatives df) Returns ------- utilities : pandas.DataFrame Will have the index of `df` and a single column of utilities """ trace_label = tracing.extend_trace_label(trace_label, "eval_interaction_utils") logger.info("Running eval_interaction_utilities on %s rows" % df.shape[0]) assert(len(spec.columns) == 1) # avoid altering caller's passed-in locals_d parameter (they may be looping) locals_d = locals_d.copy() if locals_d is not None else {} locals_d.update(locals()) def to_series(x): if np.isscalar(x): return pd.Series([x] * len(df), index=df.index) if isinstance(x, np.ndarray): return pd.Series(x, index=df.index) return x if trace_rows is not None and trace_rows.any(): # # convert to numpy array so we can slice ndarrays as well as series # trace_rows = np.asanyarray(trace_rows) assert type(trace_rows) == np.ndarray trace_eval_results = OrderedDict() else: trace_eval_results = None check_for_variability = config.setting('check_for_variability') # need to be able to identify which variables causes an error, which keeps # this from being expressed more parsimoniously utilities = pd.DataFrame({'utility': 0.0}, index=df.index) no_variability = has_missing_vals = 0 if estimator: # ensure alt_id from interaction_dataset is available in expression_values_df for # estimator.write_interaction_expression_values and eventual omnibus table assembly alt_id = estimator.get_alt_id() assert alt_id in df.columns expression_values_df = df[[alt_id]] # FIXME estimation_requires_chooser_id_in_df_column # estimation requires that chooser_id is either in index or a column of interaction_dataset # so it can be reformatted (melted) and indexed by chooser_id and alt_id # we assume caller has this under control if index is named if df.index.name is None: chooser_id = estimator.get_chooser_id() assert chooser_id in df.columns, \ "Expected to find choose_id column '%s' in interaction dataset" % 
(chooser_id, ) assert df.index.name is None expression_values_df[chooser_id] = df[chooser_id] if isinstance(spec.index, pd.MultiIndex): exprs = spec.index.get_level_values(simulate.SPEC_EXPRESSION_NAME) labels = spec.index.get_level_values(simulate.SPEC_LABEL_NAME) else: exprs = spec.index labels = spec.index for expr, label, coefficient in zip(exprs, labels, spec.iloc[:, 0]): try: # - allow temps of form _od_DIST@od_skim['DIST'] if expr.startswith('_'): target = expr[:expr.index('@')] rhs = expr[expr.index('@') + 1:] v = to_series(eval(rhs, globals(), locals_d)) # update locals to allows us to ref previously assigned targets locals_d[target] = v if trace_eval_results is not None: trace_eval_results[expr] = v[trace_rows] # mem.trace_memory_info("eval_interaction_utilities TEMP: %s" % expr) continue if expr.startswith('@'): v = to_series(eval(expr[1:], globals(), locals_d)) else: v = df.eval(expr) if check_for_variability and v.std() == 0: logger.info("%s: no variability (%s) in: %s" % (trace_label, v.iloc[0], expr)) no_variability += 1 # FIXME - how likely is this to happen? Not sure it is really a problem? if check_for_variability and np.count_nonzero(v.isnull().values) > 0: logger.info("%s: missing values in: %s" % (trace_label, expr)) has_missing_vals += 1 if estimator: # in case we modified expression_values_df index v = v.values if isinstance(v, pd.Series) else v expression_values_df.insert(loc=len(expression_values_df.columns), column=label, value=v.values if isinstance(v, pd.Series) else v) utilities.utility += (v * coefficient).astype('float') if trace_eval_results is not None: # expressions should have been uniquified when spec was read # (though we could do it here if need be...) # expr = assign.uniquify_key(trace_eval_results, expr, template="{} # ({})") assert expr not in trace_eval_results trace_eval_results[expr] = v[trace_rows] k = 'partial utility (coefficient = %s) for %s' % (coefficient, expr) trace_eval_results[k] = v[trace_rows] * coefficient except Exception as err: logger.exception(f"{trace_label} - {type(err).__name__} ({str(err)}) evaluating: {str(expr)}") raise err # mem.trace_memory_info("eval_interaction_utilities: %s" % expr) if estimator: estimator.log("eval_interaction_utilities write_interaction_expression_values %s" % trace_label) estimator.write_interaction_expression_values(expression_values_df) del expression_values_df if no_variability > 0: logger.warning("%s: %s columns have no variability" % (trace_label, no_variability)) if has_missing_vals > 0: logger.warning("%s: %s columns have missing values" % (trace_label, has_missing_vals)) if trace_eval_results is not None: trace_eval_results['total utility'] = utilities.utility[trace_rows] trace_eval_results = pd.DataFrame.from_dict(trace_eval_results) trace_eval_results.index = df[trace_rows].index # add df columns to trace_results trace_eval_results = pd.concat([df[trace_rows], trace_eval_results], axis=1) return utilities, trace_eval_results
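# The core trick above, in isolation: with a single-alternative spec, each
# expression is evaluated once over the interaction dataframe, scaled by its
# coefficient, and summed into one utility column, avoiding a wide
# eval-results matrix and a dot product. A minimal sketch with a made-up spec:
def _demo_partial_utilities():
    df = pd.DataFrame({'dist': [1.0, 2.0, 3.0], 'income': [50.0, 60.0, 70.0]})
    spec = [('dist', -0.5), ('income / 100', 0.2)]  # (expression, coefficient)
    utility = pd.Series(0.0, index=df.index)
    for expr, coef in spec:
        utility += df.eval(expr) * coef
    return utility  # same values the eval-matrix dot product would produce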
if 'and' in condition_required.keys():
    cond_2 = condition_required['and']
    if cond_2[0].replace('-', '').isdigit():
        # the literal comes first, so reverse the operands and mirror the comparison
        cond_2.reverse()
        flip = {'<': '>', '>': '<', '<=': '>=', '>=': '<='}
        cond_2[1] = flip.get(cond_2[1], cond_2[1])
    table_2 = {cond_2.index(x): x.split('.')[0] for x in cond_2 if '.' in x}
    cond_2 = {cond_2.index(x): x.split('.')[-1] for x in cond_2}
    connection = 'and'
elif 'or' in condition_required.keys():
    cond_2 = condition_required['or']
    if cond_2[0].replace('-', '').isdigit():
        cond_2.reverse()
        flip = {'<': '>', '>': '<', '<=': '>=', '>=': '<='}
        connection = flip.get(cond_2[1], cond_2[1])
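# The same normalisation as a small helper: when the numeric literal comes
# first, reverse the operands and mirror the comparison so the column
# reference ends up on the left. The example condition is made up.
def _normalise_condition(cond):
    flip = {'<': '>', '>': '<', '<=': '>=', '>=': '<='}
    if cond[0].replace('-', '').isdigit():
        cond = list(reversed(cond))
        cond[1] = flip.get(cond[1], cond[1])
    return cond
# _normalise_condition(['-5', '<', 'orders.total'])
# -> ['orders.total', '>', '-5']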