def process_file(self, filename, delta_threshold=0.05, freq_threshold=1, save=True):

    candidates = []
    starting = time.time()

    #Initialize Beam Search class
    BS = BeamSearch(delta_threshold, self.association_dict)

    for line in self.Encoder.load_stream(filename):
        if len(line) > 2:
            #Beam Search extraction
            candidates += BS.beam_search(line)

    #Count each candidate, get dictionary with candidate frequencies
    candidates = ct.frequencies(candidates)
    print("\t" + str(len(candidates)) + " candidates before pruning.")

    #Reduce nonce candidates (keep only counts above the frequency threshold)
    above_threshold = lambda x: x > freq_threshold
    candidates = ct.valfilter(above_threshold, candidates)

    #Print time and number of remaining candidates
    print("\t" + str(len(candidates)) + " candidates in " + str(time.time() - starting) + " seconds.")

    if save:
        self.Loader.save_file(candidates, filename + ".candidates.p")
        return os.path.join(self.Loader.output_dir, filename + ".candidates.p")

    else:
        return candidates
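# --- Illustrative sketch (not from the original corpus) ---
# Minimal, self-contained demo of the count-then-prune idiom used by
# process_file above; the token list and threshold are made up.
import toolz as ct

tokens = ["a", "b", "a", "c", "a", "b"]
freq_threshold = 1

counts = ct.frequencies(tokens)                       # {'a': 3, 'b': 2, 'c': 1}
pruned = ct.valfilter(lambda n: n > freq_threshold, counts)
print(pruned)                                         # {'a': 3, 'b': 2}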
def test_class_sigs():
    """ Test that all ``cdef class`` extension types in ``cytoolz`` have
        correctly embedded the function signature as done in ``toolz``.
    """
    import toolz
    # only consider items created in both `toolz` and `cytoolz`
    toolz_dict = valfilter(isfrommod('toolz'), toolz.__dict__)
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)

    # only test `cdef class` extensions from `cytoolz`
    cytoolz_dict = valfilter(lambda x: not isinstance(x, BuiltinFunctionType),
                             cytoolz_dict)

    # full API coverage should be tested elsewhere
    toolz_dict = keyfilter(lambda x: x in cytoolz_dict, toolz_dict)
    cytoolz_dict = keyfilter(lambda x: x in toolz_dict, cytoolz_dict)

    d = merge_with(identity, toolz_dict, cytoolz_dict)
    for key, (toolz_func, cytoolz_func) in d.items():
        if key in ['excepts', 'juxt', 'memoize', 'flip']:
            continue
        try:
            # function
            toolz_spec = inspect.getargspec(toolz_func)
        except TypeError:
            try:
                # curried or partial object
                toolz_spec = inspect.getargspec(toolz_func.func)
            except (TypeError, AttributeError):
                # class
                toolz_spec = inspect.getargspec(toolz_func.__init__)

        # For Cython < 0.25
        toolz_sig = toolz_func.__name__ + inspect.formatargspec(*toolz_spec)
        doc = cytoolz_func.__doc__
        # For Cython >= 0.25
        toolz_sig_alt = toolz_func.__name__ + inspect.formatargspec(
            *toolz_spec,
            **{'formatvalue': lambda x: '=' + getattr(x, '__name__', repr(x))}
        )
        doc_alt = doc.replace('Py_ssize_t ', '')
        if not (toolz_sig in doc or toolz_sig_alt in doc_alt):
            message = ('cytoolz.%s does not have correct function signature.'
                       '\n\nExpected: %s'
                       '\n\nDocstring in cytoolz is:\n%s'
                       % (key, toolz_sig, cytoolz_func.__doc__))
            assert False, message
def test_curried_namespace():
    exceptions = import_module('cytoolz.curried.exceptions')
    namespace = {}

    def should_curry(func):
        if not callable(func) or isinstance(func, cytoolz.curry):
            return False
        nargs = cytoolz.functoolz.num_required_args(func)
        if nargs is None or nargs > 1:
            return True
        return nargs == 1 and cytoolz.functoolz.has_keywords(func)

    def curry_namespace(ns):
        return dict(
            (name, cytoolz.curry(f) if should_curry(f) else f)
            for name, f in ns.items() if '__' not in name
        )

    from_cytoolz = curry_namespace(vars(cytoolz))
    from_exceptions = curry_namespace(vars(exceptions))
    namespace.update(cytoolz.merge(from_cytoolz, from_exceptions))

    namespace = cytoolz.valfilter(callable, namespace)
    curried_namespace = cytoolz.valfilter(callable, cytoolz.curried.__dict__)

    if namespace != curried_namespace:
        missing = set(namespace) - set(curried_namespace)
        if missing:
            raise AssertionError('There are missing functions in cytoolz.curried:\n %s'
                                 % ' \n'.join(sorted(missing)))
        extra = set(curried_namespace) - set(namespace)
        if extra:
            raise AssertionError('There are extra functions in cytoolz.curried:\n %s'
                                 % ' \n'.join(sorted(extra)))
        unequal = cytoolz.merge_with(list, namespace, curried_namespace)
        unequal = cytoolz.valfilter(lambda x: x[0] != x[1], unequal)
        messages = []
        for name, (orig_func, auto_func) in sorted(unequal.items()):
            if name in from_exceptions:
                messages.append('%s should come from cytoolz.curried.exceptions' % name)
            elif should_curry(getattr(cytoolz, name)):
                messages.append('%s should be curried from cytoolz' % name)
            else:
                messages.append('%s should come from cytoolz and NOT be curried' % name)
        raise AssertionError('\n'.join(messages))
def get_timed_out(self) -> List[Hash32]:
    timed_out = cytoolz.valfilter(
        lambda v: time.time() - v[0] > self.reply_timeout, self.active_requests)
    for peer, (_, node_keys) in timed_out.items():
        self.logger.debug(
            "Timed out waiting for %d nodes from %s", len(node_keys), peer)
    self.active_requests = cytoolz.dissoc(self.active_requests, *timed_out.keys())
    return list(
        cytoolz.concat(node_keys for _, node_keys in timed_out.values()))
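# --- Illustrative sketch (not from the original corpus) ---
# The expire-and-collect idiom from get_timed_out on a plain dict; the
# request table and timeout value are hypothetical.
import time
import cytoolz

reply_timeout = 5.0
active_requests = {
    "peer-1": (time.time() - 10.0, ["key-a"]),   # stale entry
    "peer-2": (time.time(), ["key-b"]),          # fresh entry
}

timed_out = cytoolz.valfilter(
    lambda v: time.time() - v[0] > reply_timeout, active_requests)
active_requests = cytoolz.dissoc(active_requests, *timed_out.keys())
expired = list(cytoolz.concat(keys for _, keys in timed_out.values()))
print(expired)  # ['key-a']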
def test_docstrings_uptodate():
    import toolz
    differ = difflib.Differ()

    # only consider items created in both `toolz` and `cytoolz`
    toolz_dict = valfilter(isfrommod('toolz'), toolz.__dict__)
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)

    # only test functions that have docstrings defined in `toolz`
    toolz_dict = valfilter(lambda x: getattr(x, '__doc__', ''), toolz_dict)

    # full API coverage should be tested elsewhere
    toolz_dict = keyfilter(lambda x: x in cytoolz_dict, toolz_dict)
    cytoolz_dict = keyfilter(lambda x: x in toolz_dict, cytoolz_dict)

    d = merge_with(identity, toolz_dict, cytoolz_dict)
    for key, (toolz_func, cytoolz_func) in d.items():
        # only check if the new docstring *contains* the expected docstring
        toolz_doc = convertdoc(toolz_func)
        cytoolz_doc = cytoolz_func.__doc__
        if toolz_doc not in cytoolz_doc:
            diff = list(differ.compare(toolz_doc.splitlines(),
                                       cytoolz_doc.splitlines()))
            fulldiff = list(diff)
            # remove additional lines at the beginning
            while diff and diff[0].startswith('+'):
                diff.pop(0)
            # remove additional lines at the end
            while diff and diff[-1].startswith('+'):
                diff.pop()

            def checkbad(line):
                return (line.startswith('+') and
                        not ('# doctest: +SKIP' in line and
                             key in skipped_doctests))

            if any(map(checkbad, diff)):
                assert False, 'Error: cytoolz.%s has a bad docstring:\n%s\n' % (
                    key, '\n'.join(fulldiff))
def test_class_sigs():
    """ Test that all ``cdef class`` extension types in ``cytoolz`` have
        correctly embedded the function signature as done in ``toolz``.
    """
    import toolz
    # only consider items created in both `toolz` and `cytoolz`
    toolz_dict = valfilter(isfrommod('toolz'), toolz.__dict__)
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)

    # only test `cdef class` extensions from `cytoolz`
    cytoolz_dict = valfilter(lambda x: not isinstance(x, BuiltinFunctionType),
                             cytoolz_dict)

    # full API coverage should be tested elsewhere
    toolz_dict = keyfilter(lambda x: x in cytoolz_dict, toolz_dict)
    cytoolz_dict = keyfilter(lambda x: x in toolz_dict, cytoolz_dict)

    d = merge_with(identity, toolz_dict, cytoolz_dict)
    for key, (toolz_func, cytoolz_func) in d.items():
        if key in ['excepts', 'juxt']:
            continue
        try:
            # function
            toolz_spec = inspect.getargspec(toolz_func)
        except TypeError:
            try:
                # curried or partial object
                toolz_spec = inspect.getargspec(toolz_func.func)
            except (TypeError, AttributeError):
                # class
                toolz_spec = inspect.getargspec(toolz_func.__init__)

        toolz_sig = toolz_func.__name__ + inspect.formatargspec(*toolz_spec)
        if toolz_sig not in cytoolz_func.__doc__:
            message = ('cytoolz.%s does not have correct function signature.'
                       '\n\nExpected: %s'
                       '\n\nDocstring in cytoolz is:\n%s'
                       % (key, toolz_sig, cytoolz_func.__doc__))
            assert False, message
def plot_community_labels(self, ax, level=None, ratio=None, offset=0.05):
    self.check_status()

    if ratio is None:
        ratio = self.node_ratio + offset

    if level is None:
        level = self.community_level if self.community_level else 0

    community_ids = set(self.membership_per_level[level].values())

    for c_id in community_ids:
        nodes_in_community = list(
            valfilter(lambda x: x == c_id,
                      self.membership_per_level[level]).keys())

        community_angles = [
            self.node_angles_dict[n_id] for n_id in nodes_in_community
        ]
        community_angles = [a if a >= 0 else a + 360 for a in community_angles]

        community_angle = self.node_angles[int(c_id)]
        if community_angle < 0:
            community_angle += 360

        min_angle = min(community_angles)
        max_angle = max(community_angles)
        mid_angle = 0.5 * (max_angle + min_angle)

        mid_angle_radians = np.radians(mid_angle)
        pos_x = ratio * np.cos(mid_angle_radians)
        pos_y = ratio * np.sin(mid_angle_radians)

        ha = 'left' if pos_x >= 0 else 'right'

        if mid_angle > 90:
            mid_angle = mid_angle - 180
        elif mid_angle < -90:
            mid_angle = mid_angle + 180

        ax.annotate(f'{c_id}', (pos_x, pos_y),
                    rotation=mid_angle,
                    ha=ha,
                    va='center',
                    rotation_mode='anchor',
                    fontsize='small')
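# --- Illustrative sketch (not from the original corpus) ---
# The membership-lookup idiom shared by the plotting methods here: valfilter
# inverts a node -> community map for one community id (made-up data).
from toolz import valfilter

membership = {0: "A", 1: "A", 2: "B", 3: "A"}
nodes_in_A = list(valfilter(lambda c: c == "A", membership).keys())
print(nodes_in_A)  # [0, 1, 3]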
def test_class_sigs():
    """ Test that all ``cdef class`` extension types in ``cytoolz`` have
        correctly embedded the function signature as done in ``toolz``.
    """
    import toolz
    # only consider items created in both `toolz` and `cytoolz`
    toolz_dict = valfilter(isfrommod('toolz'), toolz.__dict__)
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)

    # only test `cdef class` extensions from `cytoolz`
    cytoolz_dict = valfilter(lambda x: not isinstance(x, BuiltinFunctionType),
                             cytoolz_dict)

    # full API coverage should be tested elsewhere
    toolz_dict = keyfilter(lambda x: x in cytoolz_dict, toolz_dict)
    cytoolz_dict = keyfilter(lambda x: x in toolz_dict, cytoolz_dict)

    d = merge_with(identity, toolz_dict, cytoolz_dict)
    for key, (toolz_func, cytoolz_func) in d.items():
        try:
            # function
            toolz_spec = inspect.getargspec(toolz_func)
        except TypeError:
            try:
                # curried or partial object
                toolz_spec = inspect.getargspec(toolz_func.func)
            except (TypeError, AttributeError):
                # class
                toolz_spec = inspect.getargspec(toolz_func.__init__)

        toolz_sig = toolz_func.__name__ + inspect.formatargspec(*toolz_spec)
        if toolz_sig not in cytoolz_func.__doc__:
            message = ('cytoolz.%s does not have correct function signature.'
                       '\n\nExpected: %s'
                       '\n\nDocstring in cytoolz is:\n%s'
                       % (key, toolz_sig, cytoolz_func.__doc__))
            assert False, message
def merge_ngrams(self, files=None, n_gram_threshold=1):

    all_ngrams = []

    #Get a list of ngram files
    if files is None:
        files = self.Loader.list_output(type="ngrams")

    #Break into lists of 20 files
    file_list = ct.partition_all(20, files)

    for file_chunk in file_list:

        ngrams = []  #Initialize holding list

        #Load
        for dict_file in file_chunk:
            try:
                ngrams.append(self.Loader.load_file(dict_file))
            except Exception:
                print("Not loading " + str(dict_file))

        #Merge
        ngrams = ct.merge_with(sum, ngrams)

        print("\tSUB-TOTAL NGRAMS: " + str(len(ngrams)))
        print("\tSUB-TOTAL WORDS: " + str(ngrams["TOTAL"]))
        print("\n")

        all_ngrams.append(ngrams)

    #Now merge everything
    all_ngrams = ct.merge_with(sum, all_ngrams)

    print("\tTOTAL NGRAMS: " + str(len(all_ngrams)))
    print("\tTOTAL WORDS: " + str(all_ngrams["TOTAL"]))

    #Now enforce threshold
    keepable = lambda x: x > n_gram_threshold
    all_ngrams = ct.valfilter(keepable, all_ngrams)

    print("\tAfter pruning:")
    print("\tTOTAL NGRAMS: " + str(len(all_ngrams)))

    return all_ngrams
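# --- Illustrative sketch (not from the original corpus) ---
# The merge-then-threshold idiom from merge_ngrams on two made-up count
# dictionaries; merge_with(sum, ...) adds counts for shared keys.
import toolz as ct

chunk_counts = [
    {"the": 4, "cat": 1, "TOTAL": 5},
    {"the": 2, "dog": 1, "TOTAL": 3},
]
merged = ct.merge_with(sum, chunk_counts)   # {'the': 6, 'cat': 1, 'dog': 1, 'TOTAL': 8}
pruned = ct.valfilter(lambda n: n > 1, merged)
print(pruned)                               # {'the': 6, 'TOTAL': 8}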
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
def _batch_import(base_class, cls, elements, fn):
    logging.debug('Trying to import {1} from {0} elements'.format(
        len(elements), cls))
    internal_ids = set(pluck('id_str', fn(elements)))
    existing_users = cls.objects.filter(internal_id__in=internal_ids)
    existing_ids = set([u.internal_id for u in existing_users])
    user_pks = dict([(u.internal_id, u.pk) for u in existing_users])
    new_ids = internal_ids - existing_ids
    logging.debug('Existing IDs: {0}'.format(len(existing_ids)))
    logging.debug('New IDs: {0}'.format(len(new_ids)))

    added_keys = set()
    new_elements = []
    for element in fn(elements):
        if element['id_str'] in user_pks:
            element['__pk__'] = user_pks[element['id_str']]
            element['__created__'] = False
        else:
            if not element['id_str'] in added_keys:
                user_model = cls()
                user_model.copy_json(valfilter(lambda x: x, element))
                new_elements.append(user_model)
                element['__created__'] = True
                element['__pk__'] = None
                added_keys.add(element['id_str'])

    cls.objects.bulk_create(new_elements)
    new_models = list(cls.objects.filter(internal_id__in=new_ids))
    logging.debug('New IDs created successfully: {0}'.format(
        len(new_models)))

    new_pks = dict([(u.internal_id, u.pk) for u in new_models])
    for element in fn(elements):
        if element['id_str'] in new_pks:
            element['__pk__'] = new_pks[element['id_str']]

    return new_models
def get_top(self, association_dict, direction, number):

    #Make initial cuts without sorting to save time
    temp_dict = {key: association_dict[key][direction] for key in association_dict.keys()}

    current_threshold = 0.25
    while True:
        above_threshold = lambda x: x > current_threshold
        temp_dict = ct.valfilter(above_threshold, temp_dict)

        if len(temp_dict) > 10000:
            current_threshold += 0.05
        else:
            break

    #Sort and keep the top `number` pairs
    return_list = [(key, value) for key, value in sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)]
    return_list = return_list[:number]

    for key, value in return_list:
        yield key, value
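# --- Illustrative sketch (not from the original corpus) ---
# An equivalent standalone version of the rising-threshold cut in get_top:
# repeatedly valfilter with a higher bar until at most `cap` items remain
# (the scores are randomly generated for the demo).
import random
import toolz as ct

scores = {i: random.random() for i in range(50000)}
threshold, cap = 0.25, 10000

while len(scores) > cap:
    scores = ct.valfilter(lambda s: s > threshold, scores)
    threshold += 0.05

top_10 = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:10]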
def merge_candidates(self, output_files, threshold):

    candidates = []
    print("Merging " + str(len(output_files)) + " files.")

    #Load
    for dict_file in output_files:
        try:
            candidates.append(self.Loader.load_file(dict_file))
        except Exception as e:
            print("ERROR")
            print(e)

    #Merge
    candidates = ct.merge_with(sum, candidates)
    print("\tTOTAL CANDIDATES BEFORE PRUNING: " + str(len(candidates)))

    #Prune
    above_threshold = lambda x: x > threshold
    candidates = ct.valfilter(above_threshold, candidates)
    print("\tTOTAL CANDIDATES AFTER PRUNING: " + str(len(candidates)))

    return candidates
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = build_lca_map(sam_file, lambda x: int(find_between(x, begin, end)), tree)
            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
def test_sig_at_beginning():
    """ Test that the function signature is at the beginning of the docstring
        and is followed by exactly one blank line.
    """
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)
    cytoolz_dict = keyfilter(lambda x: x not in skip_sigs, cytoolz_dict)

    for key, val in cytoolz_dict.items():
        doclines = val.__doc__.splitlines()
        assert len(doclines) > 2, (
            'cytoolz.%s docstring too short:\n\n%s' % (key, val.__doc__))

        sig = '%s(' % aliases.get(key, key)
        assert sig in doclines[0], (
            'cytoolz.%s docstring missing signature at beginning:\n\n%s'
            % (key, val.__doc__))

        assert not doclines[1], (
            'cytoolz.%s docstring missing blank line after signature:\n\n%s'
            % (key, val.__doc__))

        assert doclines[2], (
            'cytoolz.%s docstring too many blank lines after signature:\n\n%s'
            % (key, val.__doc__))
def shogun_functional(input, output, bt2_indx, extract_ncbi_tid, threads):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    # Create a SAM file for each input FASTA file
    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    img_map = IMGMap()

    for basename in basenames:
        sam_inf = os.path.join(output, basename + '.sam')
        step_outf = 'test'  # NOTE: placeholder output path left over in the original
        if os.path.isfile(step_outf):
            print("Found the \"%s.kegg.csv\". Skipping the LCA phase for this file." % step_outf)
        else:
            # NOTE: this call is incomplete in the original (trailing comma, result unused)
            lca_map = build_img_ncbi_map(yield_alignments_from_sam_inf(sam_inf), )

    # NOTE: the rest of this function references names that are not defined in its
    # scope (`args`, `run_lca`, `annotate_lineage`, `depth`); it appears to be pasted
    # in from a CLI entry point and is kept as-is.
    sam_files = [os.path.join(args.input, filename) for filename in os.listdir(args.input) if filename.endswith('.sam')]

    img_map = IMGMap()

    ncbi_tree = NCBITree()
    lca = LCA(ncbi_tree, args.depth)

    with open(args.output, 'w') if args.output else sys.stdout as outf:
        csv_outf = csv.writer(outf, quoting=csv.QUOTE_ALL, lineterminator='\n')
        csv_outf.writerow(['sample_id', 'sequence_id', 'ncbi_tid', 'img_id'])
        for file in sam_files:
            with open(file) as inf:
                lca_map = build_lca_map(yield_alignments_from_sam_inf(inf), lca, img_map)
                for key in lca_map:
                    img_ids, ncbi_tid = lca_map[key]
                    csv_outf.writerow([os.path.basename(file).split('.')[0], key, ncbi_tid, ','.join(img_ids)])

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
def valfilter(self, predicate):
    """Return a new fdict containing only the items whose value satisfies `predicate`."""
    return fdict(cytoolz.valfilter(predicate, self))
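# --- Illustrative sketch (not from the original corpus) ---
# Assuming fdict is a dict subclass (as the wrapper above implies), the
# method keeps filtered results chainable; the data here is made up.
prices = fdict({"apple": 3, "kiwi": 12, "plum": 7})
cheap = prices.valfilter(lambda p: p < 10)
print(cheap)  # fdict with {'apple': 3, 'plum': 7}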
def plot_community_wedges(
    self,
    ax,
    level=1,
    wedge_width=0.5,
    wedge_ratio=None,
    wedge_offset=0.05,
    alpha=1.0,
    fill_gaps=False,
    palette="plasma",
    label_func=None,
):
    if wedge_ratio is None:
        wedge_ratio = self.node_ratio + wedge_offset

    community_ids = sorted(set(self.membership_per_level[level].values()))
    community_colors = dict(
        zip(community_ids, sns.color_palette(palette, n_colors=len(community_ids))))

    wedge_meta = []
    wedge_gap = 180 / self.network.num_vertices() if fill_gaps else 0

    # from https://matplotlib.org/stable/gallery/pie_and_polar_charts/pie_and_donut_labels.html
    bbox_props = dict(boxstyle="square,pad=0.3", fc="none", ec="none")
    kw = dict(
        arrowprops=dict(arrowstyle="-", color="#abacab"),
        bbox=bbox_props,
        zorder=0,
        va="center",
        fontsize=8,
    )

    for c_id in community_ids:
        nodes_in_community = list(
            valfilter(lambda x: x == c_id,
                      self.membership_per_level[level]).keys())

        community_angles = [
            self.node_angles_dict[n_id] for n_id in nodes_in_community
        ]
        community_angles = [a if a >= 0 else a + 360 for a in community_angles]

        community_angle = self.node_angles_dict[int(c_id)]
        if community_angle < 0:
            community_angle += 360

        min_angle = min(community_angles)
        max_angle = max(community_angles)
        extent_angle = max_angle - min_angle

        if extent_angle < 0:
            min_angle, max_angle = max_angle, min_angle

        if fill_gaps:
            min_angle -= wedge_gap
            max_angle += wedge_gap

        wedge_meta.append({
            "community_id": c_id,
            "n_nodes": len(nodes_in_community),
            "center_angle": community_angle,
            "extent_angle": extent_angle,
            "min_angle": min_angle,
            "max_angle": max_angle,
            "color": community_colors[c_id],
        })

        if label_func is not None:
            community_label = label_func(c_id)
            if community_label:
                ratio = wedge_ratio + wedge_width
                mid_angle = 0.5 * (max_angle + min_angle)
                mid_angle_radians = np.radians(mid_angle)
                pos_x = ratio * np.cos(mid_angle_radians)
                pos_y = ratio * np.sin(mid_angle_radians)
                horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(pos_x))]
                connectionstyle = "angle,angleA=0,angleB={}".format(mid_angle)
                kw["arrowprops"].update({"connectionstyle": connectionstyle})
                ax.annotate(
                    community_label,
                    xy=(pos_x, pos_y),
                    xytext=(1.35 * pos_x, 1.4 * pos_y),
                    horizontalalignment=horizontalalignment,
                    **kw,
                )

    collection = [
        Wedge(
            0.0,
            wedge_ratio + wedge_width,
            w["min_angle"],
            w["max_angle"],
            width=wedge_width,
        ) for w in wedge_meta
    ]
    ax.add_collection(
        PatchCollection(
            collection,
            edgecolor="none",
            color=[w["color"] for w in wedge_meta],
            alpha=alpha,
        ))

    return wedge_meta, collection
def plot_community_wedges(self,
                          ax,
                          level=None,
                          wedge_width=0.5,
                          wedge_ratio=None,
                          wedge_offset=0.05,
                          alpha=1.0,
                          fill_gaps=False,
                          palette='plasma'):
    self.check_status()

    if wedge_ratio is None:
        wedge_ratio = self.node_ratio + wedge_offset

    if level is None:
        level = self.community_level

    community_ids = set(self.membership_per_level[level].values())
    community_colors = sns.color_palette(palette, n_colors=len(community_ids))

    wedge_meta = []
    wedge_gap = 180 / self.network.num_vertices() if fill_gaps else 0

    for c_id in community_ids:
        nodes_in_community = list(
            valfilter(lambda x: x == c_id,
                      self.membership_per_level[level]).keys())

        community_angles = [
            self.node_angles_dict[n_id] for n_id in nodes_in_community
        ]
        community_angles = [a if a >= 0 else a + 360 for a in community_angles]

        community_angle = self.node_angles[int(c_id)]
        if community_angle < 0:
            community_angle += 360

        min_angle = min(community_angles)
        max_angle = max(community_angles)
        extent_angle = max_angle - min_angle

        if extent_angle < 0:
            min_angle, max_angle = max_angle, min_angle

        if fill_gaps:
            min_angle -= wedge_gap
            max_angle += wedge_gap

        wedge_meta.append({
            'n_nodes': len(nodes_in_community),
            'center_angle': community_angle,
            'extent_angle': extent_angle,
            'min_angle': min_angle,
            'max_angle': max_angle,
            'color': community_colors[c_id]
        })

    collection = [
        Wedge(0.0,
              wedge_ratio + wedge_width,
              w['min_angle'],
              w['max_angle'],
              width=wedge_width) for w in wedge_meta
    ]
    ax.add_collection(
        PatchCollection(collection,
                        edgecolor='none',
                        color=[w['color'] for w in wedge_meta],
                        alpha=alpha))

    return wedge_meta, collection
def test_curried_namespace():
    namespace = {}

    def should_curry(func):
        if not callable(func) or isinstance(func, curry):
            return False
        nargs = num_required_args(func)
        if nargs is None or nargs > 1:
            return True
        else:
            return nargs == 1 and has_keywords(func)

    def curry_namespace(ns):
        return dict(
            (
                name,
                curry(f) if should_curry(f) else f,
            )
            for name, f in ns.items()
            if '__' not in name
        )

    all_auto_curried = curry_namespace(vars(eth_utils))
    inferred_namespace = valfilter(callable, all_auto_curried)
    curried_namespace = valfilter(callable, eth_utils.curried.__dict__)

    if inferred_namespace != curried_namespace:
        missing = set(inferred_namespace) - set(curried_namespace)
        if missing:
            to_insert = sorted("%s," % f for f in missing)
            raise AssertionError(
                'There are missing functions in eth_utils.curried:\n' +
                '\n'.join(to_insert)
            )
        extra = set(curried_namespace) - set(inferred_namespace)
        if extra:
            raise AssertionError(
                'There are extra functions in eth_utils.curried:\n' +
                '\n'.join(sorted(extra))
            )
        unequal = merge_with(list, inferred_namespace, curried_namespace)
        unequal = valfilter(lambda x: x[0] != x[1], unequal)
        to_curry = keyfilter(lambda x: should_curry(getattr(eth_utils, x)), unequal)
        if to_curry:
            to_curry_formatted = sorted('{0} = curry({0})'.format(f) for f in to_curry)
            raise AssertionError(
                'There are missing functions to curry in eth_utils.curried:\n' +
                '\n'.join(to_curry_formatted)
            )
        elif unequal:
            not_to_curry_formatted = sorted(unequal)
            raise AssertionError(
                'Missing functions NOT to curry in eth_utils.curried:\n' +
                '\n'.join(not_to_curry_formatted)
            )
        else:
            raise AssertionError("unexplained difference between %r and %r" % (
                inferred_namespace, curried_namespace,
            ))
def named_clause_groups(self) -> dict:
    """Dict of the clause subgroups contained directly in this group, keyed by name.

    :rtype: Dict[str, ClauseGroup]
    """
    return valfilter(lambda md: isinstance(md, ClauseGroup), self.named_children())
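# --- Illustrative sketch (not from the original corpus) ---
# The same type-based filtering on a plain dict; ClauseGroup and
# named_children() belong to the surrounding class and are not reproduced.
from toolz import valfilter

children = {"title": "some text", "clauses": ["clause-1", "clause-2"]}
only_lists = valfilter(lambda md: isinstance(md, list), children)
print(only_lists)  # {'clauses': ['clause-1', 'clause-2']}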
def get_students():
    load_data()
    return {k: v.json for k, v in
            t.valfilter(lambda v: not v.deleted, data['students']).iteritems()}
def process_ngrams(self, filename, Encoder, save=False):

    print("\t\tStarting " + filename)

    #Initialize bigram dictionary
    ngrams = defaultdict(int)
    unigrams = defaultdict(int)

    starting = time.time()
    total = 0

    for line in Encoder.load_stream(filename):

        total += len(line)

        #Store unigrams
        for item in line:
            unigrams[(1, item[0])] += 1
            unigrams[(2, item[1])] += 1
            unigrams[(3, item[2])] += 1

        try:
            for bigram in ct.sliding_window(2, line):
                #Tuples are indexes for (LEX, POS, CAT)
                #Index types are 1 (LEX), 2 (POS), 3 (CAT)
                ngrams[((1, bigram[0][0]), (1, bigram[1][0]))] += 1  #lex_lex
                ngrams[((1, bigram[0][0]), (2, bigram[1][1]))] += 1  #lex_pos
                ngrams[((1, bigram[0][0]), (3, bigram[1][2]))] += 1  #lex_cat
                ngrams[((2, bigram[0][1]), (2, bigram[1][1]))] += 1  #pos_pos
                ngrams[((2, bigram[0][1]), (1, bigram[1][0]))] += 1  #pos_lex
                ngrams[((2, bigram[0][1]), (3, bigram[1][2]))] += 1  #pos_cat
                ngrams[((3, bigram[0][2]), (3, bigram[1][2]))] += 1  #cat_cat
                ngrams[((3, bigram[0][2]), (2, bigram[1][1]))] += 1  #cat_pos
                ngrams[((3, bigram[0][2]), (1, bigram[1][0]))] += 1  #cat_lex

        #Catch errors from empty lines coming out of the encoder
        except Exception as e:
            error = e

    #Reduce nonce ngrams
    size = len(ngrams)
    keepable = lambda x: x > 1
    ngrams = ct.valfilter(keepable, ngrams)

    #Note: Keep all unigrams, they are already limited by the lexicon

    #Reduce null indexes
    ngrams = {key: ngrams[key] for key in list(ngrams.keys()) if 0 not in key[0] and 0 not in key[1]}
    unigrams = {key: unigrams[key] for key in list(unigrams.keys()) if 0 not in key}

    ngrams = ct.merge([ngrams, unigrams])
    ngrams["TOTAL"] = total
    del unigrams

    #Print status
    print("\tTime: " + str(time.time() - starting)
          + " Full: " + str(size)
          + " Reduced: " + str(len(ngrams))
          + " with " + str(ngrams["TOTAL"]) + " words.")

    if save:
        self.Loader.save_file(ngrams, filename + ".ngrams.p")
        return os.path.join(self.Loader.output_dir, filename + ".ngrams.p")

    else:
        return ngrams
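# --- Illustrative sketch (not from the original corpus) ---
# The bigram-counting core of process_ngrams on one made-up line of
# (LEX, POS, CAT) triples, keeping only the lex_lex index for brevity.
from collections import defaultdict
import toolz as ct

line = [("the", "DET", "NP"), ("cat", "NOUN", "NP"), ("sat", "VERB", "VP")]
ngrams = defaultdict(int)
for left, right in ct.sliding_window(2, line):
    ngrams[((1, left[0]), (1, right[0]))] += 1   # lex_lex, as above
print(dict(ngrams))  # {((1, 'the'), (1, 'cat')): 1, ((1, 'cat'), (1, 'sat')): 1}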
def get_users():
    return {k: v.json for k, v in
            t.valfilter(lambda v: not v.deleted, data['users']).iteritems()}