Example #1
	def process_file(self, filename, delta_threshold = 0.05, freq_threshold = 1, save = True):
		
		candidates = []
		starting = time.time()
		
		#Initialize Beam Search class
		BS = BeamSearch(delta_threshold, self.association_dict)
		
		for line in self.Encoder.load_stream(filename):

			if len(line) > 2:
				
				#Beam Search extraction
				candidates += BS.beam_search(line)
			
		#Count each candidate, get dictionary with candidate frequencies
		candidates = ct.frequencies(candidates)
		print("\t" + str(len(candidates)) + " candidates before pruning.")
		
		#Reduce nonce candidates
		above_zero = lambda x: x > freq_threshold
		candidates = ct.valfilter(above_zero, candidates)		
			
		#Print time and number of remaining candidates
		print("\t" + str(len(candidates)) + " candidates in " + str(time.time() - starting) + " seconds.")
	
		if save == True:
			self.Loader.save_file(candidates, filename + ".candidates.p")
			return os.path.join(self.Loader.output_dir, filename + ".candidates.p")
				
		else:
			return candidates
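The pruning idiom in this example — count candidates with ct.frequencies, then drop low-frequency entries with ct.valfilter — can be exercised on its own. A minimal sketch with illustrative values (the toolz alias mirrors the ct used above):

import toolz as ct

tokens = ["a", "b", "a", "c", "a", "b"]
counts = ct.frequencies(tokens)                       # {'a': 3, 'b': 2, 'c': 1}
freq_threshold = 1
pruned = ct.valfilter(lambda n: n > freq_threshold, counts)
print(pruned)                                         # {'a': 3, 'b': 2}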
Example #2
def test_class_sigs():
    """ Test that all ``cdef class`` extension types in ``cytoolz`` have
        correctly embedded the function signature as done in ``toolz``.
    """
    import toolz
    # only consider items created in both `toolz` and `cytoolz`
    toolz_dict = valfilter(isfrommod('toolz'), toolz.__dict__)
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)

    # only test `cdef class` extensions from `cytoolz`
    cytoolz_dict = valfilter(lambda x: not isinstance(x, BuiltinFunctionType),
                             cytoolz_dict)

    # full API coverage should be tested elsewhere
    toolz_dict = keyfilter(lambda x: x in cytoolz_dict, toolz_dict)
    cytoolz_dict = keyfilter(lambda x: x in toolz_dict, cytoolz_dict)

    d = merge_with(identity, toolz_dict, cytoolz_dict)
    for key, (toolz_func, cytoolz_func) in d.items():
        if key in ['excepts', 'juxt', 'memoize', 'flip']:
            continue
        try:
            # function
            toolz_spec = inspect.getargspec(toolz_func)
        except TypeError:
            try:
                # curried or partial object
                toolz_spec = inspect.getargspec(toolz_func.func)
            except (TypeError, AttributeError):
                # class
                toolz_spec = inspect.getargspec(toolz_func.__init__)

        # For Cython < 0.25
        toolz_sig = toolz_func.__name__ + inspect.formatargspec(*toolz_spec)
        doc = cytoolz_func.__doc__
        # For Cython >= 0.25
        toolz_sig_alt = toolz_func.__name__ + inspect.formatargspec(
            *toolz_spec,
            **{'formatvalue': lambda x: '=' + getattr(x, '__name__', repr(x))}
        )
        doc_alt = doc.replace('Py_ssize_t ', '')
        if not (toolz_sig in doc or toolz_sig_alt in doc_alt):
            message = ('cytoolz.%s does not have correct function signature.'
                       '\n\nExpected: %s'
                       '\n\nDocstring in cytoolz is:\n%s'
                       % (key, toolz_sig, cytoolz_func.__doc__))
            assert False, message
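Note that inspect.getargspec and inspect.formatargspec used above are deprecated and were removed in Python 3.11. A hedged sketch of how a comparable signature string could be built on modern interpreters (not part of the original test):

import inspect

def sig_string(func):
    # rough stand-in for func.__name__ + formatargspec(*getargspec(func))
    return func.__name__ + str(inspect.signature(func))

def example(a, b=1, *args, **kwargs):
    pass

print(sig_string(example))   # example(a, b=1, *args, **kwargs)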
Example #3
def test_curried_namespace():
    exceptions = import_module('cytoolz.curried.exceptions')
    namespace = {}

    def should_curry(func):
        if not callable(func) or isinstance(func, cytoolz.curry):
            return False
        nargs = cytoolz.functoolz.num_required_args(func)
        if nargs is None or nargs > 1:
            return True
        return nargs == 1 and cytoolz.functoolz.has_keywords(func)


    def curry_namespace(ns):
        return dict(
            (name, cytoolz.curry(f) if should_curry(f) else f)
            for name, f in ns.items() if '__' not in name
        )

    from_cytoolz = curry_namespace(vars(cytoolz))
    from_exceptions = curry_namespace(vars(exceptions))
    namespace.update(cytoolz.merge(from_cytoolz, from_exceptions))

    namespace = cytoolz.valfilter(callable, namespace)
    curried_namespace = cytoolz.valfilter(callable, cytoolz.curried.__dict__)

    if namespace != curried_namespace:
        missing = set(namespace) - set(curried_namespace)
        if missing:
            raise AssertionError('There are missing functions in cytoolz.curried:\n    %s'
                                 % '    \n'.join(sorted(missing)))
        extra = set(curried_namespace) - set(namespace)
        if extra:
            raise AssertionError('There are extra functions in cytoolz.curried:\n    %s'
                                 % '    \n'.join(sorted(extra)))
        unequal = cytoolz.merge_with(list, namespace, curried_namespace)
        unequal = cytoolz.valfilter(lambda x: x[0] != x[1], unequal)
        messages = []
        for name, (orig_func, auto_func) in sorted(unequal.items()):
            if name in from_exceptions:
                messages.append('%s should come from cytoolz.curried.exceptions' % name)
            elif should_curry(getattr(cytoolz, name)):
                messages.append('%s should be curried from cytoolz' % name)
            else:
                messages.append('%s should come from cytoolz and NOT be curried' % name)
        raise AssertionError('\n'.join(messages))
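The test above hinges on curry deferring a call until enough positional arguments arrive. A minimal standalone sketch of that behaviour (toolz exposes the same curry API as cytoolz):

from toolz import curry

@curry
def add(x, y):
    return x + y

add1 = add(1)        # partially applied, still callable
print(add1(2))       # 3
print(add(1, 2))     # 3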
Example #4
 def get_timed_out(self) -> List[Hash32]:
     timed_out = cytoolz.valfilter(
         lambda v: time.time() - v[0] > self.reply_timeout,
         self.active_requests)
     for peer, (_, node_keys) in timed_out.items():
         self.logger.debug("Timed out waiting for %d nodes from %s",
                           len(node_keys), peer)
     self.active_requests = cytoolz.dissoc(self.active_requests,
                                           *timed_out.keys())
     return list(
         cytoolz.concat(node_keys for _, node_keys in timed_out.values()))
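The timeout sweep above pairs valfilter (select expired entries) with dissoc (drop them from the active set). A small self-contained sketch of the same pattern, with plain strings standing in for peers and node keys:

import time
import cytoolz

reply_timeout = 5.0
now = time.time()
active_requests = {"peer-a": (now - 10, ["k1", "k2"]),   # started 10s ago -> timed out
                   "peer-b": (now - 1, ["k3"])}          # started 1s ago  -> still live

timed_out = cytoolz.valfilter(lambda v: now - v[0] > reply_timeout, active_requests)
active_requests = cytoolz.dissoc(active_requests, *timed_out.keys())
print(sorted(timed_out), sorted(active_requests))        # ['peer-a'] ['peer-b']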
Example #5
def test_docstrings_uptodate():
    import toolz
    differ = difflib.Differ()

    # only consider items created in both `toolz` and `cytoolz`
    toolz_dict = valfilter(isfrommod('toolz'), toolz.__dict__)
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)

    # only test functions that have docstrings defined in `toolz`
    toolz_dict = valfilter(lambda x: getattr(x, '__doc__', ''), toolz_dict)

    # full API coverage should be tested elsewhere
    toolz_dict = keyfilter(lambda x: x in cytoolz_dict, toolz_dict)
    cytoolz_dict = keyfilter(lambda x: x in toolz_dict, cytoolz_dict)

    d = merge_with(identity, toolz_dict, cytoolz_dict)
    for key, (toolz_func, cytoolz_func) in d.items():
        # only check if the new docstring *contains* the expected docstring
        toolz_doc = convertdoc(toolz_func)
        cytoolz_doc = cytoolz_func.__doc__
        if toolz_doc not in cytoolz_doc:
            diff = list(
                differ.compare(toolz_doc.splitlines(),
                               cytoolz_doc.splitlines()))
            fulldiff = list(diff)
            # remove additional lines at the beginning
            while diff and diff[0].startswith('+'):
                diff.pop(0)
            # remove additional lines at the end
            while diff and diff[-1].startswith('+'):
                diff.pop()

            def checkbad(line):
                return (line.startswith('+')
                        and not ('# doctest: +SKIP' in line
                                 and key in skipped_doctests))

            if any(map(checkbad, diff)):
                assert False, 'Error: cytoolz.%s has a bad docstring:\n%s\n' % (
                    key, '\n'.join(fulldiff))
Example #6
def test_docstrings_uptodate():
    import toolz
    differ = difflib.Differ()

    # only consider items created in both `toolz` and `cytoolz`
    toolz_dict = valfilter(isfrommod('toolz'), toolz.__dict__)
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)

    # only test functions that have docstrings defined in `toolz`
    toolz_dict = valfilter(lambda x: getattr(x, '__doc__', ''), toolz_dict)

    # full API coverage should be tested elsewhere
    toolz_dict = keyfilter(lambda x: x in cytoolz_dict, toolz_dict)
    cytoolz_dict = keyfilter(lambda x: x in toolz_dict, cytoolz_dict)

    d = merge_with(identity, toolz_dict, cytoolz_dict)
    for key, (toolz_func, cytoolz_func) in d.items():
        # only check if the new docstring *contains* the expected docstring
        toolz_doc = convertdoc(toolz_func)
        cytoolz_doc = cytoolz_func.__doc__
        if toolz_doc not in cytoolz_doc:
            diff = list(differ.compare(toolz_doc.splitlines(),
                                       cytoolz_doc.splitlines()))
            fulldiff = list(diff)
            # remove additional lines at the beginning
            while diff and diff[0].startswith('+'):
                diff.pop(0)
            # remove additional lines at the end
            while diff and diff[-1].startswith('+'):
                diff.pop()

            def checkbad(line):
                return (line.startswith('+') and
                        not ('# doctest: +SKIP' in line and
                             key in skipped_doctests))

            if any(map(checkbad, diff)):
                assert False, 'Error: cytoolz.%s has a bad docstring:\n%s\n' % (
                    key, '\n'.join(fulldiff))
Example #7
def test_class_sigs():
    """ Test that all ``cdef class`` extension types in ``cytoolz`` have
        correctly embedded the function signature as done in ``toolz``.
    """
    import toolz
    # only consider items created in both `toolz` and `cytoolz`
    toolz_dict = valfilter(isfrommod('toolz'), toolz.__dict__)
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)

    # only test `cdef class` extensions from `cytoolz`
    cytoolz_dict = valfilter(lambda x: not isinstance(x, BuiltinFunctionType),
                             cytoolz_dict)

    # full API coverage should be tested elsewhere
    toolz_dict = keyfilter(lambda x: x in cytoolz_dict, toolz_dict)
    cytoolz_dict = keyfilter(lambda x: x in toolz_dict, cytoolz_dict)

    d = merge_with(identity, toolz_dict, cytoolz_dict)
    for key, (toolz_func, cytoolz_func) in d.items():
        if key in ['excepts', 'juxt']:
            continue
        try:
            # function
            toolz_spec = inspect.getargspec(toolz_func)
        except TypeError:
            try:
                # curried or partial object
                toolz_spec = inspect.getargspec(toolz_func.func)
            except (TypeError, AttributeError):
                # class
                toolz_spec = inspect.getargspec(toolz_func.__init__)

        toolz_sig = toolz_func.__name__ + inspect.formatargspec(*toolz_spec)
        if toolz_sig not in cytoolz_func.__doc__:
            message = ('cytoolz.%s does not have correct function signature.'
                       '\n\nExpected: %s'
                       '\n\nDocstring in cytoolz is:\n%s'
                       % (key, toolz_sig, cytoolz_func.__doc__))
            assert False, message
Example #8
    def plot_community_labels(self, ax, level=None, ratio=None, offset=0.05):
        self.check_status()

        if ratio is None:
            ratio = self.node_ratio + offset

        if level is None:
            level = self.community_level if self.community_level else 0

        community_ids = set(self.membership_per_level[level].values())

        for c_id in community_ids:

            nodes_in_community = list(
                valfilter(lambda x: x == c_id,
                          self.membership_per_level[level]).keys())

            community_angles = [
                self.node_angles_dict[n_id] for n_id in nodes_in_community
            ]
            community_angles = [
                a if a >= 0 else a + 360 for a in community_angles
            ]
            community_angle = self.node_angles[int(c_id)]

            if community_angle < 0:
                community_angle += 360

            min_angle = min(community_angles)
            max_angle = max(community_angles)

            mid_angle = 0.5 * (max_angle + min_angle)
            mid_angle_radians = np.radians(mid_angle)

            pos_x, pos_y = ratio * np.cos(mid_angle_radians), ratio * np.sin(
                mid_angle_radians)

            ha = 'left' if pos_x >= 0 else 'right'

            if mid_angle > 90:
                mid_angle = mid_angle - 180
            elif mid_angle < -90:
                mid_angle = mid_angle + 180

            ax.annotate(f'{c_id}', (pos_x, pos_y),
                        rotation=mid_angle,
                        ha=ha,
                        va='center',
                        rotation_mode='anchor',
                        fontsize='small')
Example #9
def test_class_sigs():
    """ Test that all ``cdef class`` extension types in ``cytoolz`` have
        correctly embedded the function signature as done in ``toolz``.
    """
    import toolz
    # only consider items created in both `toolz` and `cytoolz`
    toolz_dict = valfilter(isfrommod('toolz'), toolz.__dict__)
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)

    # only test `cdef class` extensions from `cytoolz`
    cytoolz_dict = valfilter(lambda x: not isinstance(x, BuiltinFunctionType),
                             cytoolz_dict)

    # full API coverage should be tested elsewhere
    toolz_dict = keyfilter(lambda x: x in cytoolz_dict, toolz_dict)
    cytoolz_dict = keyfilter(lambda x: x in toolz_dict, cytoolz_dict)

    d = merge_with(identity, toolz_dict, cytoolz_dict)
    for key, (toolz_func, cytoolz_func) in d.items():
        try:
            # function
            toolz_spec = inspect.getargspec(toolz_func)
        except TypeError:
            try:
                # curried or partial object
                toolz_spec = inspect.getargspec(toolz_func.func)
            except (TypeError, AttributeError):
                # class
                toolz_spec = inspect.getargspec(toolz_func.__init__)

        toolz_sig = toolz_func.__name__ + inspect.formatargspec(*toolz_spec)
        if toolz_sig not in cytoolz_func.__doc__:
            message = ('cytoolz.%s does not have correct function signature.'
                       '\n\nExpected: %s'
                       '\n\nDocstring in cytoolz is:\n%s'
                       % (key, toolz_sig, cytoolz_func.__doc__))
            assert False, message
Example #10
	def merge_ngrams(self, files = None, n_gram_threshold = 1):
		
		all_ngrams = []
		
		#Get a list of ngram files
		if files == None:
			files = self.Loader.list_output(type = "ngrams")
			
		#Break into lists of 20 files
		file_list = ct.partition_all(20, files)
		
		for files in file_list:
			
			ngrams = []		#Initialize holding list
			
			#Load
			for dict_file in files:
				try:
					ngrams.append(self.Loader.load_file(dict_file))
				except:
					print("Not loading " + str(dict_file))
		
			#Merge
			ngrams = ct.merge_with(sum, [x for x in ngrams])
		
			print("\tSUB-TOTAL NGRAMS: " + str(len(list(ngrams.keys()))))
			print("\tSUB-TOTAL WORDS: " + str(ngrams["TOTAL"]))
			print("\n")
			
			all_ngrams.append(ngrams)
			
		#Now merge everything
		all_ngrams = ct.merge_with(sum, [x for x in all_ngrams])
		
		print("\tTOTAL NGRAMS: " + str(len(list(all_ngrams.keys()))))
		print("\tTOTAL WORDS: " + str(all_ngrams["TOTAL"]))
		
		#Now enforce threshold
		keepable = lambda x: x > n_gram_threshold
		all_ngrams = ct.valfilter(keepable, all_ngrams)
		
		print("\tAfter pruning:")
		print("\tTOTAL NGRAMS: " + str(len(list(all_ngrams.keys()))))
		
		return all_ngrams
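The batching-and-merging idiom above — partition_all to chunk the file list, then merge_with(sum) to combine count dictionaries — works on any dictionaries of counts. A minimal sketch, independent of the Loader plumbing:

import toolz as ct

chunks = list(ct.partition_all(2, ["f1", "f2", "f3"]))   # [('f1', 'f2'), ('f3',)]

counts_per_file = [{"the": 3, "cat": 1}, {"the": 2}, {"cat": 4}]
merged = ct.merge_with(sum, counts_per_file)
print(chunks)
print(merged)                                            # {'the': 5, 'cat': 5}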
Example #11
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))
    
    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth-1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
Example #12
    def _batch_import(base_class, cls, elements, fn):
        logging.debug('Trying to import {1} from {0} elements'.format(
            len(elements), cls))
        internal_ids = set(pluck('id_str', fn(elements)))

        existing_users = cls.objects.filter(internal_id__in=internal_ids)
        existing_ids = set([u.internal_id for u in existing_users])
        user_pks = dict([(u.internal_id, u.pk) for u in existing_users])
        new_ids = internal_ids - existing_ids

        logging.debug('Existing IDs: {0}'.format(len(existing_ids)))
        logging.debug('New IDs: {0}'.format(len(new_ids)))

        added_keys = set()
        new_elements = []
        for element in fn(elements):
            if element['id_str'] in user_pks:
                element['__pk__'] = user_pks[element['id_str']]
                element['__created__'] = False
            else:
                if not element['id_str'] in added_keys:
                    user_model = cls()
                    user_model.copy_json(valfilter(lambda x: x, element))
                    new_elements.append(user_model)
                element['__created__'] = True
                element['__pk__'] = None
                added_keys.add(element['id_str'])

        cls.objects.bulk_create(new_elements)

        new_models = list(cls.objects.filter(internal_id__in=new_ids))
        logging.debug('New IDs created successfully: {0}'.format(
            len(new_models)))
        new_pks = dict([(u.internal_id, u.pk) for u in new_models])
        for element in fn(elements):
            if element['id_str'] in new_pks:
                element['__pk__'] = new_pks[element['id_str']]

        return new_models
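Two small toolz pieces carry this import routine: pluck('id_str', ...) extracts one field from every element, and valfilter(lambda x: x, element) strips falsy fields before they are copied onto the model. A sketch of both in isolation, with made-up records:

from toolz import pluck, valfilter

elements = [{"id_str": "1", "name": "ada", "bio": ""},
            {"id_str": "2", "name": "bob", "bio": None}]

ids = set(pluck("id_str", elements))
cleaned = [valfilter(lambda v: v, e) for e in elements]   # drop falsy fields
print(ids)          # {'1', '2'}
print(cleaned)      # [{'id_str': '1', 'name': 'ada'}, {'id_str': '2', 'name': 'bob'}]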
Example #13
    def process_file(self,
                     filename,
                     delta_threshold=0.05,
                     freq_threshold=1,
                     save=True):

        candidates = []
        starting = time.time()

        #Initialize Beam Search class
        BS = BeamSearch(delta_threshold, self.association_dict)

        for line in self.Encoder.load_stream(filename):

            if len(line) > 2:

                #Beam Search extraction
                candidates += BS.beam_search(line)

        #Count each candidate, get dictionary with candidate frequencies
        candidates = ct.frequencies(candidates)
        print("\t" + str(len(candidates)) + " candidates before pruning.")

        #Reduce nonce candidates
        above_zero = lambda x: x > freq_threshold
        candidates = ct.valfilter(above_zero, candidates)

        #Print time and number of remaining candidates
        print("\t" + str(len(candidates)) + " candidates in " +
              str(time.time() - starting) + " seconds.")

        if save == True:
            self.Loader.save_file(candidates, filename + ".candidates.p")
            return os.path.join(self.Loader.output_dir,
                                filename + ".candidates.p")

        else:
            return candidates
Example #14
	def get_top(self, association_dict, direction, number):
		
		#Make initial cuts without sorting to save time
		temp_dict = {key: association_dict[key][direction] for key in association_dict.keys()}
		current_threshold = 0.25
		
		while True:
		
			above_threshold = lambda x: x > current_threshold
			temp_dict = ct.valfilter(above_threshold, temp_dict)
			
			if len(list(temp_dict.keys())) > 10000:
				current_threshold = current_threshold + 0.05
				
			else:
				break
		
		#Sort and reduce
		return_list = [(key, value) for key, value in sorted(temp_dict.items(), key=lambda x: x[1], reverse = True)]
		return_list = return_list[0:number+1]

		for key, value in return_list:
			yield key, value
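get_top raises the cutoff in 0.05 steps until valfilter leaves a manageable dictionary, and only then sorts. The same filter-before-sort idea on a toy dictionary (threshold and slice size are illustrative):

import toolz as ct

scores = {"a": 0.9, "b": 0.4, "c": 0.7, "d": 0.2}
threshold = 0.25
kept = ct.valfilter(lambda v: v > threshold, scores)                 # cheap pass first
top = sorted(kept.items(), key=lambda kv: kv[1], reverse=True)[:2]   # then sort the survivors
print(top)                                                           # [('a', 0.9), ('c', 0.7)]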
Example #15
	def merge_candidates(self, output_files, threshold):
		
		candidates = []
		print("Merging " + str(len(output_files)) + " files.")
		
		#Load
		for dict_file in output_files:
			try:
				candidates.append(self.Loader.load_file(dict_file))
			except Exception as e:
				print("ERROR")
				print(e)
		
		#Merge
		candidates = ct.merge_with(sum, [x for x in candidates])
		print("\tTOTAL CANDIDATES BEFORE PRUNING: " + str(len(list(candidates.keys()))))
		
		#Prune
		above_threshold = lambda x: x > threshold
		candidates = ct.valfilter(above_threshold, candidates)
		print("\tTOTAL CANDIDATES AFTER PRUNING: " + str(len(list(candidates.keys()))))
		
		return candidates
Example #16
    def _batch_import(base_class, cls, elements, fn):
        logging.debug('Trying to import {1} from {0} elements'.format(len(elements), cls))
        internal_ids = set(pluck('id_str', fn(elements)))

        existing_users = cls.objects.filter(internal_id__in=internal_ids)
        existing_ids = set([u.internal_id for u in existing_users])
        user_pks = dict([(u.internal_id, u.pk) for u in existing_users])
        new_ids = internal_ids - existing_ids

        logging.debug('Existing IDs: {0}'.format(len(existing_ids)))
        logging.debug('New IDs: {0}'.format(len(new_ids)))

        added_keys = set()
        new_elements = []
        for element in fn(elements):
            if element['id_str'] in user_pks:
                element['__pk__'] = user_pks[element['id_str']]
                element['__created__'] = False
            else:
                if not element['id_str'] in added_keys:
                    user_model = cls()
                    user_model.copy_json(valfilter(lambda x: x, element))
                    new_elements.append(user_model)
                element['__created__'] = True
                element['__pk__'] = None
                added_keys.add(element['id_str'])

        cls.objects.bulk_create(new_elements)

        new_models = list(cls.objects.filter(internal_id__in=new_ids))
        logging.debug('New IDs created successfully: {0}'.format(len(new_models)))
        new_pks = dict([(u.internal_id, u.pk) for u in new_models])
        for element in fn(elements):
            if element['id_str'] in new_pks:
                element['__pk__'] = new_pks[element['id_str']]

        return new_models
Example #17
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth-1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')

            lca_map = build_lca_map(sam_file, lambda x: int(find_between(x, begin, end)), tree)

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
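The lineage branch above maps every value through the tree (valmap), while the rank branch keeps only values already at the requested rank (valfilter); both then count with Counter(filter(None, ...)). The contrast in miniature, with plain lambdas standing in for the NCBI tree lookups (the taxon IDs and lookups here are illustrative stand-ins):

from collections import Counter
from toolz import valmap, valfilter

lca_map = {"read1": 9606, "read2": 10090, "read3": None}

lineage = lambda tid: {9606: "Homo sapiens", 10090: "Mus musculus"}.get(tid)   # stand-in lookup
annotated = valmap(lineage, lca_map)
print(Counter(filter(None, annotated.values())))   # Counter({'Homo sapiens': 1, 'Mus musculus': 1})

is_species = lambda tid: tid in (9606, 10090)      # stand-in rank test
at_rank = valfilter(is_species, lca_map)
print(Counter(filter(None, at_rank.values())))     # Counter({9606: 1, 10090: 1})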
Example #18
def test_sig_at_beginning():
    """ Test that the function signature is at the beginning of the docstring
        and is followed by exactly one blank line.
    """
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)
    cytoolz_dict = keyfilter(lambda x: x not in skip_sigs, cytoolz_dict)

    for key, val in cytoolz_dict.items():
        doclines = val.__doc__.splitlines()
        assert len(doclines) > 2, (
            'cytoolz.%s docstring too short:\n\n%s' % (key, val.__doc__))

        sig = '%s(' % aliases.get(key, key)
        assert sig in doclines[0], (
            'cytoolz.%s docstring missing signature at beginning:\n\n%s'
            % (key, val.__doc__))

        assert not doclines[1], (
            'cytoolz.%s docstring missing blank line after signature:\n\n%s'
            % (key, val.__doc__))

        assert doclines[2], (
            'cytoolz.%s docstring too many blank lines after signature:\n\n%s'
            % (key, val.__doc__))
Example #19
def test_sig_at_beginning():
    """ Test that the function signature is at the beginning of the docstring
        and is followed by exactly one blank line.
    """
    cytoolz_dict = valfilter(isfrommod('cytoolz'), cytoolz.__dict__)
    cytoolz_dict = keyfilter(lambda x: x not in skip_sigs, cytoolz_dict)

    for key, val in cytoolz_dict.items():
        doclines = val.__doc__.splitlines()
        assert len(doclines) > 2, (
            'cytoolz.%s docstring too short:\n\n%s' % (key, val.__doc__))

        sig = '%s(' % aliases.get(key, key)
        assert sig in doclines[0], (
            'cytoolz.%s docstring missing signature at beginning:\n\n%s'
            % (key, val.__doc__))

        assert not doclines[1], (
            'cytoolz.%s docstring missing blank line after signature:\n\n%s'
            % (key, val.__doc__))

        assert doclines[2], (
            'cytoolz.%s docstring too many blank lines after signature:\n\n%s'
            % (key, val.__doc__))
Example #20
def shogun_functional(input, output, bt2_indx, extract_ncbi_tid, threads):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    # Create a SAM file for each input FASTA file
    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    img_map = IMGMap()

    for basename in basenames:
        sam_inf = os.path.join(output, basename + '.sam')
        step_outf = 'test'
        if os.path.isfile(step_outf):
            print("Found the \"%s.kegg.csv\". Skipping the LCA phase for this file." % step_outf)
        else:
            lca_map = build_img_ncbi_map(yield_alignments_from_sam_inf(sam_inf), )

    sam_files = [os.path.join(args.input, filename) for filename in os.listdir(args.input) if filename.endswith('.sam')]

    img_map = IMGMap()

    ncbi_tree = NCBITree()
    lca = LCA(ncbi_tree, args.depth)

    with open(args.output, 'w') if args.output else sys.stdout as outf:
        csv_outf = csv.writer(outf, quoting=csv.QUOTE_ALL, lineterminator='\n')
        csv_outf.writerow(['sample_id', 'sequence_id', 'ncbi_tid', 'img_id'])
        for file in sam_files:
            with open(file) as inf:
                lca_map = build_lca_map(yield_alignments_from_sam_inf(inf), lca, img_map)
                for key in lca_map:
                    img_ids, ncbi_tid = lca_map[key]
                    csv_outf.writerow([os.path.basename(file).split('.')[0], key, ncbi_tid, ','.join(img_ids)])

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
Example #21
 def valfilter(self, predicate):
     return fdict(cytoolz.valfilter(predicate, self))
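This example wraps cytoolz.valfilter as a method so that filtering returns another fdict and calls can be chained. The fdict class itself is not shown on this page; a minimal hypothetical sketch of such a wrapper, only to illustrate the pattern:

import cytoolz

class fdict(dict):
    """Tiny dict subclass whose filters return fdict again, so calls chain."""
    def valfilter(self, predicate):
        return fdict(cytoolz.valfilter(predicate, self))

d = fdict(a=1, b=0, c=3)
print(d.valfilter(bool).valfilter(lambda v: v > 1))   # {'c': 3}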
Example #22
    def plot_community_wedges(
        self,
        ax,
        level=1,
        wedge_width=0.5,
        wedge_ratio=None,
        wedge_offset=0.05,
        alpha=1.0,
        fill_gaps=False,
        palette="plasma",
        label_func=None,
    ):

        if wedge_ratio is None:
            wedge_ratio = self.node_ratio + wedge_offset

        community_ids = sorted(set(self.membership_per_level[level].values()))
        community_colors = dict(
            zip(community_ids,
                sns.color_palette(palette, n_colors=len(community_ids))))

        wedge_meta = []
        wedge_gap = 180 / self.network.num_vertices() if fill_gaps else 0

        # from https://matplotlib.org/stable/gallery/pie_and_polar_charts/pie_and_donut_labels.html
        bbox_props = dict(boxstyle="square,pad=0.3", fc="none", ec="none")
        kw = dict(
            arrowprops=dict(arrowstyle="-", color="#abacab"),
            bbox=bbox_props,
            zorder=0,
            va="center",
            fontsize=8,
        )

        for c_id in community_ids:

            nodes_in_community = list(
                valfilter(lambda x: x == c_id,
                          self.membership_per_level[level]).keys())

            community_angles = [
                self.node_angles_dict[n_id] for n_id in nodes_in_community
            ]
            community_angles = [
                a if a >= 0 else a + 360 for a in community_angles
            ]
            community_angle = self.node_angles_dict[int(c_id)]

            if community_angle < 0:
                community_angle += 360

            min_angle = min(community_angles)
            max_angle = max(community_angles)

            extent_angle = max_angle - min_angle

            if extent_angle < 0:
                min_angle, max_angle = max_angle, min_angle

            if fill_gaps:
                min_angle -= wedge_gap
                max_angle += wedge_gap

            wedge_meta.append({
                "community_id": c_id,
                "n_nodes": len(nodes_in_community),
                "center_angle": community_angle,
                "extent_angle": extent_angle,
                "min_angle": min_angle,
                "max_angle": max_angle,
                "color": community_colors[c_id],
            })

            if label_func is not None:
                community_label = label_func(c_id)
                if community_label:
                    ratio = wedge_ratio + wedge_width

                    mid_angle = 0.5 * (max_angle + min_angle)
                    mid_angle_radians = np.radians(mid_angle)

                    pos_x, pos_y = ratio * np.cos(
                        mid_angle_radians), ratio * np.sin(mid_angle_radians)

                    horizontalalignment = {
                        -1: "right",
                        1: "left"
                    }[int(np.sign(pos_x))]
                    connectionstyle = "angle,angleA=0,angleB={}".format(
                        mid_angle)
                    kw["arrowprops"].update(
                        {"connectionstyle": connectionstyle})
                    ax.annotate(
                        community_label,
                        xy=(pos_x, pos_y),
                        xytext=(1.35 * pos_x, 1.4 * pos_y),
                        horizontalalignment=horizontalalignment,
                        **kw,
                    )

        collection = [
            Wedge(
                0.0,
                wedge_ratio + wedge_width,
                w["min_angle"],
                w["max_angle"],
                width=wedge_width,
            ) for w in wedge_meta
        ]
        ax.add_collection(
            PatchCollection(
                collection,
                edgecolor="none",
                color=[w["color"] for w in wedge_meta],
                alpha=alpha,
            ))

        return wedge_meta, collection
Example #23
    def plot_community_wedges(self,
                              ax,
                              level=None,
                              wedge_width=0.5,
                              wedge_ratio=None,
                              wedge_offset=0.05,
                              alpha=1.0,
                              fill_gaps=False,
                              palette='plasma'):
        self.check_status()

        if wedge_ratio is None:
            wedge_ratio = self.node_ratio + wedge_offset

        if level is None:
            level = self.community_level

        community_ids = set(self.membership_per_level[level].values())
        community_colors = sns.color_palette(palette,
                                             n_colors=len(community_ids))

        wedge_meta = []
        wedge_gap = 180 / self.network.num_vertices() if fill_gaps else 0

        for c_id in community_ids:

            nodes_in_community = list(
                valfilter(lambda x: x == c_id,
                          self.membership_per_level[level]).keys())

            community_angles = [
                self.node_angles_dict[n_id] for n_id in nodes_in_community
            ]
            community_angles = [
                a if a >= 0 else a + 360 for a in community_angles
            ]
            community_angle = self.node_angles[int(c_id)]

            if community_angle < 0:
                community_angle += 360

            min_angle = min(community_angles)
            max_angle = max(community_angles)

            extent_angle = max_angle - min_angle

            if extent_angle < 0:
                min_angle, max_angle = max_angle, min_angle

            if fill_gaps:
                min_angle -= wedge_gap
                max_angle += wedge_gap

            wedge_meta.append({
                'n_nodes': len(nodes_in_community),
                'center_angle': community_angle,
                'extent_angle': extent_angle,
                'min_angle': min_angle,
                'max_angle': max_angle,
                'color': community_colors[c_id]
            })

        collection = [
            Wedge(0.0,
                  wedge_ratio + wedge_width,
                  w['min_angle'],
                  w['max_angle'],
                  width=wedge_width) for w in wedge_meta
        ]
        ax.add_collection(
            PatchCollection(collection,
                            edgecolor='none',
                            color=[w['color'] for w in wedge_meta],
                            alpha=alpha))

        return wedge_meta, collection
Example #24
def test_curried_namespace():
    namespace = {}

    def should_curry(func):
        if not callable(func) or isinstance(func, curry):
            return False
        nargs = num_required_args(func)
        if nargs is None or nargs > 1:
            return True
        else:
            return nargs == 1 and has_keywords(func)


    def curry_namespace(ns):
        return dict(
            (
                name,
                curry(f) if should_curry(f) else f,
            )
            for name, f in ns.items()
            if '__' not in name
        )

    all_auto_curried = curry_namespace(vars(eth_utils))

    inferred_namespace = valfilter(callable, all_auto_curried)
    curried_namespace = valfilter(callable, eth_utils.curried.__dict__)

    if inferred_namespace != curried_namespace:
        missing = set(inferred_namespace) - set(curried_namespace)
        if missing:
            to_insert = sorted("%s," % f for f in missing)
            raise AssertionError(
                'There are missing functions in eth_utils.curried:\n'
                + '\n'.join(to_insert)
            )
        extra = set(curried_namespace) - set(inferred_namespace)
        if extra:
            raise AssertionError(
                'There are extra functions in eth_utils.curried:\n'
                + '\n'.join(sorted(extra))
            )
        unequal = merge_with(list, inferred_namespace, curried_namespace)
        unequal = valfilter(lambda x: x[0] != x[1], unequal)
        to_curry = keyfilter(lambda x: should_curry(getattr(eth_utils, x)), unequal)
        if to_curry:
            to_curry_formatted = sorted('{0} = curry({0})'.format(f) for f in to_curry)
            raise AssertionError(
                'There are missing functions to curry in eth_utils.curried:\n'
                + '\n'.join(to_curry_formatted)
            )
        elif unequal:
            not_to_curry_formatted = sorted(unequal)
            raise AssertionError(
                'Missing functions NOT to curry in eth_utils.curried:\n'
                + '\n'.join(not_to_curry_formatted)
            )
        else:
            raise AssertionError("unexplained difference between %r and %r" % (
                inferred_namespace,
                curried_namespace,
            ))
Example #25
    def named_clause_groups(self)->dict:
        """Dict of subgroup of clauses contained directly in this group with name as key.

        :rtype: Dict[str, ClauseGroup]
        """
        return valfilter(lambda md: isinstance(md, ClauseGroup), self.named_children())
Example #26
def get_students():
    load_data()
    return {k:v.json for k,v in
            t.valfilter(lambda v: not v.deleted, data['students']).iteritems()}
Example #27
	def process_ngrams(self, filename, Encoder, save = False):

		print("\t\tStarting " + filename)
		#Initialize bigram dictionary
		ngrams = defaultdict(int)
		unigrams = defaultdict(int)
				
		starting = time.time()
		total = 0

		for line in Encoder.load_stream(filename):

			total += len(line)

			#Store unigrams
			for item in line:
				unigrams[(1, item[0])] += 1
				unigrams[(2, item[1])] += 1
				unigrams[(3, item[2])] += 1
			
			try:
				for bigram in ct.sliding_window(2, line):
					
					#Tuples are indexes for (LEX, POS, CAT)
					#Index types are 1 (LEX), 2 (POS), 3 (CAT)
					ngrams[((1, bigram[0][0]), (1, bigram[1][0]))] += 1	#lex_lex
					ngrams[((1, bigram[0][0]), (2, bigram[1][1]))] += 1	#lex_pos
					ngrams[((1, bigram[0][0]), (3, bigram[1][2]))] += 1	#lex_cat
					ngrams[((2, bigram[0][1]), (2, bigram[1][1]))] += 1	#pos_pos
					ngrams[((2, bigram[0][1]), (1, bigram[1][0]))] += 1	#pos_lex
					ngrams[((2, bigram[0][1]), (3, bigram[1][2]))] += 1	#pos_cat 
					ngrams[((3, bigram[0][2]), (3, bigram[1][2]))] += 1	#cat_cat
					ngrams[((3, bigram[0][2]), (2, bigram[1][1]))] += 1	#cat_pos
					ngrams[((3, bigram[0][2]), (1, bigram[1][0]))] += 1	#cat_lex
			
			#Catch errors from empty lines coming out of the encoder
			except Exception as e:
				error = e

		#Reduce nonce ngrams
		size = len(list(ngrams.keys()))
		keepable = lambda x: x > 1
		ngrams = ct.valfilter(keepable, ngrams)
		
		#Note: Keep all unigrams, they are already limited by the lexicon
		
		#Reduce null indexes
		ngrams = {key: ngrams[key] for key in list(ngrams.keys()) if 0 not in key[0] and 0 not in key[1]}
		unigrams = {key: unigrams[key] for key in list(unigrams.keys()) if 0 not in key}
		
		ngrams = ct.merge([ngrams, unigrams])	
		ngrams["TOTAL"] = total
		
		del unigrams
		
		#Print status
		print("\tTime: ", end = "")
		print(time.time() - starting, end = "")
		print(" Full: " + str(size) + " ", end = "")
		print(" Reduced: ", end = "")
		print(len(list(ngrams.keys())), end = "")
		print(" with " + str(ngrams["TOTAL"]) + " words.")
		
		if save == True:
			self.Loader.save_file(ngrams, filename + ".ngrams.p")
			return os.path.join(self.Loader.output_dir, filename + ".ngrams.p")
				
		else:
			return ngrams
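The bigram loop above is driven by ct.sliding_window(2, line), which yields overlapping pairs. A standalone sketch of that primitive and the defaultdict counting it feeds:

from collections import defaultdict
import toolz as ct

line = ["the", "cat", "sat", "on", "the", "mat"]
bigrams = defaultdict(int)
for left, right in ct.sliding_window(2, line):
    bigrams[(left, right)] += 1

print(dict(bigrams))   # {('the', 'cat'): 1, ('cat', 'sat'): 1, ...}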
Example #28
def get_users():
    return {k:v.json for k,v in
            t.valfilter(lambda v: not v.deleted, data['users']).iteritems()}