import collections


def flatten(nested_list):
    """Return a flattened list of items from a nested list."""
    lst = []
    for item in nested_list:
        if isinstance(item, (list, tuple)):
            lst.extend(flatten(item))
        else:
            lst.append(item)
    return lst


# Store names that have been previously assigned.
name_heap = set([None])
prefix_counts = collections.Counter()


def reset_get_unique_name():
    """Reset the heaps that store previously-assigned names."""
    global name_heap, prefix_counts
    name_heap = set([None])
    prefix_counts = collections.Counter()


def get_unique_name(lst, attrib, prefix, initial=None):
    """
    Return a name that doesn't collide with another in a list.

    This subroutine is used to generate unique part references (e.g., "R12")
    or unique net names (e.g., "N$5").
    """
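# The original get_unique_name body is truncated above. Below is a minimal,
# hypothetical sketch of how it could work, based only on the docstring and on
# the name_heap / prefix_counts globals defined earlier: try `initial` first,
# then fall back to prefix + per-prefix counter, skipping names already present
# in lst (via `attrib`) or in name_heap. The name get_unique_name_sketch and
# its logic are assumptions, not the original implementation.
def get_unique_name_sketch(lst, attrib, prefix, initial=None):
    used = {getattr(item, attrib, None) for item in lst} | name_heap
    name = initial
    while name is None or name in used:
        prefix_counts[prefix] += 1
        name = '{}{}'.format(prefix, prefix_counts[prefix])
    name_heap.add(name)
    return name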
import collections

print(collections.Counter(''.join(input().split())).most_common(1)[0][1])
import collections


def solution(participant, completion):
    answer = collections.Counter(participant) - collections.Counter(completion)
    return list(answer.keys())[0]
aspect=4)

# Assumption: the fewer people there were in your family, the faster you could get to the boat.
# The more people there are, the more management is required. However, if you had no family
# members, you might have wanted to help others and therefore sacrifice yourself.
#
# * Females traveling with up to 2 more family members had a higher chance of survival. However,
#   a high variation in survival rate appears once family size exceeds 4, as mothers/daughters
#   would search longer for their family members and therefore the chances of survival decrease.
# * Men traveling alone might have chosen to sacrifice themselves and help other people survive.

# ## 1.5 Survival rate by title
# * The barplots show that royalty normally had 1st or 2nd class tickets, whereas people with the
#   title Master mostly had 3rd class. In fact, the title 'Master' was given to unmarried boys;
#   you can see that the age of people with this title is less than 13.
# * Women and royalty had a higher survival rate. (There are only two titled women in the training
#   set and both survived; I would put them into the Mrs class.)
# * Those with civil titles and the reverends had a lower one, due to the fact that they had to /
#   wanted to help other people.

# In[ ]:

train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
print(collections.Counter(train['Title']).most_common())
test['Title'] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
print()
print(collections.Counter(test['Title']).most_common())

# In[ ]:

tab = pd.crosstab(train['Title'], train['Pclass'])
print(tab)
tab_prop = tab.div(tab.sum(1).astype(float), axis=0)
tab_prop.plot(kind="bar", stacked=True)

# Investigate who the masters were. Their age is less than 12.

# In[ ]:
def extract_expression(tumor, platform, gencode_version): """ The EXTRACT_EXPRESSION operation extracts expression values from TCGA for all the genes of interest and their candidate regulatory genes. Intermediate results files are exported locally during the execution of the function, while the final dataframes are returned as Pandas dataframes and exported locally in the Excel files 'Gene Expression - InterestGenes.xlsx' and 'Gene Expression - RegulatoryGenes.xlsx'. :param tumor: full name of the tumor of interest, encoded as a string (e.g. 'Ovarian Serous Cystadenocarcinoma', 'Breast Invasive Carcinoma', ...) :param platform: number identifying the sequencing platform (either 27 for the 27k probes sequencing platform or 450 for the 450k probes sequencing platform) :param gencode_version: number representing the GENCODE genomic annotations to use (currently, for assembly GRCh38, versions 22, 24 and 27 can be used) :return: two Pandas dataframes Example:: import genereg as gr expr_interest_df, expr_regul_df = gr.GeneExpression.extract_expression(tumor='Ovarian Serous Cystadenocarcinoma', platform=27, gencode_version=22) """ # Check input parameters tcga_tumors = ["Acute Myeloid Leukemia","Adrenocortical Carcinoma","Bladder Urothelial Carcinoma","Brain Lower Grade Glioma" ,"Breast Invasive Carcinoma","Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma","Cholangiocarcinoma","Colon Adenocarcinoma","Esophageal Carcinoma","Glioblastoma Multiforme","Head and Neck Squamous Cell Carcinoma","Kidney Chromophobe","Kidney Renal Clear Cell Carcinoma","Kidney Renal Papillary Cell Carcinoma","Liver Hepatocellular Carcinoma","Lung Adenocarcinoma","Lung Squamous Cell Carcinoma","Lymphoid Neoplasm Diffuse Large B-cell Lymphoma","Mesothelioma","Ovarian Serous Cystadenocarcinoma","Pancreatic Adenocarcinoma","Pheochromocytoma and Paraganglioma","Prostate Adenocarcinoma","Rectum Adenocarcinoma","Sarcoma","Skin Cutaneous Melanoma","Stomach Adenocarcinoma","Testicular Germ Cell Tumors","Thymoma","Thyroid Carcinoma","Uterine Carcinosarcoma","Uterine Corpus Endometrial Carcinoma","Uveal Melanoma"] if tumor not in tcga_tumors: raise ValueError('PATHOLOGY NOT SUPPORTED! You can analyze one of these 33 types of TCGA tumors: '+(', '.join(tcga_tumors))) if platform not in [27, 450]: raise ValueError('PLATFORM NOT RECOGNIZED! 
Sequencing platforms available: 27 and 450') if gencode_version not in [22, 24, 27]: raise ValueError('GRCh38 GENCODE versions available are 22, 24 and 27') # Load the list of genes of interest EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',sheetname='Sheet1',header=0,converters={'GENE_SYMBOL':str,'ENTREZ_GENE_ID':str,'GENE_SET':str}) # Create a list containing the Gene Symbols of the genes of interest genesSYM_of_interest = [] for i, r in EntrezConversion_df.iterrows(): sym = r['GENE_SYMBOL'] if sym not in genesSYM_of_interest: genesSYM_of_interest.append(sym) # Import the dictionary of genes of interest with their candidate regulatory genes dict_RegulGenes = pickle.load(open('./2_Regulatory_Genes/dict_RegulGenes.p', 'rb')) # Import the gene-TFs mapping dataframe Mapping_df = pd.read_excel('./0_Genes_Mapping/Genes Mapping.xlsx',sheetname='Sheet1',header=0,converters={'ENTREZ_GENE_ID':str,'HGNC_ID':str}) # Create a list containing the Gene Symbols of the regulatory genes of genes of interest regulatory_genesSYM = [] for key, value in dict_RegulGenes.items(): for gene in value: if gene not in regulatory_genesSYM: regulatory_genesSYM.append(gene) # Extract the list of distinct Gene Symbols mapped in the mapping table mapped_gene_SYMs = [] for index, row in Mapping_df.iterrows(): sym = row['GENE_SYMBOL'] if sym not in mapped_gene_SYMs: mapped_gene_SYMs.append(sym) # Execute the query for the extraction of gene expression values on the remote server, using the PyGMQL Python library gl.set_remote_address('http://gmql.eu/gmql-rest/') gl.login() gl.set_mode('remote') # Load the TCGA datasets to be used in the query methylation_dataset = gl.load_from_remote(remote_name='GRCh38_TCGA_methylation', owner='public') expression_dataset = gl.load_from_remote(remote_name='GRCh38_TCGA_gene_expression', owner='public') # Identify the sequencing platform to be used if platform == 27: seq_platform = 'Illumina Human Methylation 27' elif platform == 450: seq_platform = 'Illumina Human Methylation 450' # Extract all the samples for the current tumor and platform all_methyl = methylation_dataset.meta_select((methylation_dataset['manually_curated__cases__disease_type'] == tumor) & (methylation_dataset['manually_curated__platform'] == seq_platform) & ((methylation_dataset['biospecimen__bio__sample_type'] == 'Primary Tumor') | (methylation_dataset['biospecimen__bio__sample_type'] == 'Recurrent Tumor')) & (methylation_dataset['clinical__shared__history_of_neoadjuvant_treatment'] == 'No')) all_expr = expression_dataset.meta_select((expression_dataset['manually_curated__cases__disease_type'] == tumor) & ((expression_dataset['biospecimen__bio__sample_type'] == 'Primary Tumor') | (expression_dataset['biospecimen__bio__sample_type'] == 'Recurrent Tumor')) & (expression_dataset['clinical__shared__history_of_neoadjuvant_treatment'] == 'No')) # Gene Expression: expr_0 = all_expr.reg_project(field_list=['ensembl_gene_id','entrez_gene_id','gene_symbol','fpkm']) expr = expr_0.meta_select(semiJoinDataset=all_methyl, semiJoinMeta=['biospecimen__bio__bcr_sample_barcode']) # Materialize the results into a GDataframe expr_Gdf = expr.materialize('./(MaterializeResults)') # The result dataset is loaded as a GDataframe, an object containing two pandas dataframes, one for the region data and one for the metadata. 
# Get the two pandas dataframes: expr_df_regs = expr_Gdf.regs expr_df_meta = expr_Gdf.meta n_regs = len(expr_df_regs) n_samples = len(expr_df_meta) # Rename 'chr', 'start', and 'stop' columns header expr_df_regs.rename(columns={'chr':'chrom','start':'left','stop':'right'}, inplace=True) # Change index into progressive integer numbers and store the name of the sample in another column expr_df_regs['sample_id'] = expr_df_regs.index expr_df_regs.index = range(n_regs) # Convert unknown values (NaN) to empty strings expr_df_regs = expr_df_regs.fillna('') # Convert all the metadata values into strings, since they're encode as lists in Python col_names = [] for name, values in expr_df_meta.iteritems(): col_names.append(name) for index, row in expr_df_meta.iterrows(): for c in col_names: list_val = row[c] # it's encoded as a list str_val = ''.join(list_val) # convert the value stored as a list in a string expr_df_meta.set_value(index,c,str_val) # Since we have to extract the expression values for each distinct sample barcode (aliquot), we create a list containing these distinct identifiers expr_sample_barcodes_all = [] for index, row in expr_df_meta.iterrows(): barcode = row['biospecimen__bio__bcr_sample_barcode'] if barcode not in expr_sample_barcodes_all: # get distinct values expr_sample_barcodes_all.append(barcode) # Check which are repeated aliquots, if present all_aliqouts = [] for index, row in expr_df_meta.iterrows(): barcode = row['biospecimen__bio__bcr_sample_barcode'] all_aliqouts.append(barcode) multiple_aliquots = [item for item, count in collections.Counter(all_aliqouts).items() if count > 1] samples_to_remove = [] expr_sample_barcodes = [] if len(multiple_aliquots) != 0: # Among the repeated aliquots, keep only the most recent ones (of 2013) for index, row in expr_df_meta.iterrows(): year = row['biospecimen__bio__year_of_shipment'] barcode = row['biospecimen__bio__bcr_sample_barcode'] if (barcode in multiple_aliquots) and year == '2011': expr_df_meta.drop(index, inplace=True) samples_to_remove.append(index) # Import the list of aliquots in the methylation dataset text_file = open(common_aliquots, 'r') aliquots = text_file.read().split('\n') aliquots.remove('') text_file.close() # Extract the new list of distinct TCGA Aliquots to extract for index, row in expr_df_meta.iterrows(): barcode = row['biospecimen__bio__bcr_sample_barcode'] if barcode in aliquots: if barcode not in expr_sample_barcodes: expr_sample_barcodes.append(barcode) else: expr_df_meta.drop(index, inplace=True) samples_to_remove.append(index) # Remove regions that corresponded to eliminated repeated aliquots expr_df_regs = expr_df_regs.loc[~(expr_df_regs['sample_id'].isin(samples_to_remove))].copy() else: expr_sample_barcodes = expr_sample_barcodes_all # Export the metadata dataframe setting the TCGA aliquots as indexes. 
Metadata_df = expr_df_meta.copy() Metadata_df['id_sample'] = Metadata_df.index Metadata_df.set_index('biospecimen__bio__bcr_sample_barcode', inplace=True) writer = ExcelWriter('./3_TCGA_Data/Gene_Expression/EXPR (Metadata).xlsx') Metadata_df.to_excel(writer,'Sheet1') writer.save() # Extract from the expression dataset all the regions that belong to genes of interest expr_df_regs_interest = expr_df_regs.loc[expr_df_regs['gene_symbol'].isin(genesSYM_of_interest)].copy() # Extract from the expression dataset all the regions that belong to regulatory genes of genes of interest expr_df_regs_regulatory = expr_df_regs.loc[expr_df_regs['gene_symbol'].isin(regulatory_genesSYM)].copy() # Gene expression values for each gene of interest: # Create a dictionary for storing all the gene expression values for each gene of interest and for each aliquot TCGA from collections import defaultdict dict_expr_interest = defaultdict(dict) for key, value in dict_expr_interest.items(): value = defaultdict(list) # The main dictionary has the Gene Symbols of the genes of interest as keys and each gene has another dictionary as value, which, in turn, has the different aliquots as keys and lists as values. # The idea is having a list, containing all the fpkm values, for each gene in each TCGA aliquot. # Set the Gene Symbol as keys of the main dictionary for name in genesSYM_of_interest: dict_expr_interest[name] = {} # Set the names of the samples barcodes as keys for each dictionary set as value of a specific key (genes) for sample in expr_sample_barcodes: for k, v in dict_expr_interest.items(): v[sample] = [] # Set the values by appending the expression values for each gene of interest: these expression values (fpkm) can be found in the 'expr_df_regs_interest' dataframe for index, row in expr_df_regs_interest.iterrows(): # iterating along the whole dataframe sym = row['gene_symbol'] # get the Gene Symbol of the gene fpkm = row['fpkm'] # get the gene expression value sample = row['sample_id'] # get the name of the sample # get the aliquot corresponding to current sample aliq = expr_df_meta.get_value(sample, 'biospecimen__bio__bcr_sample_barcode') # add the value according to the correct gene ID and TCGA aliquot, rounding it to a float with maximum 6 decimal numbers, dict_expr_interest[sym][aliq].append(round(float(fpkm),6)) # Convert the nested dictionary also into a dataframe # Create a dataframe whose row indexes are the different TCGA samples and the columns are the distinct genes of interest expr_interest_df1 = pd.DataFrame(index = expr_sample_barcodes, columns = [genesSYM_of_interest]) # Add three additional columns for the name of the sample and the ID and barcode of the patient corresponding to each aliquot, in order to have them available if we will need it expr_interest_df2 = pd.DataFrame(index = expr_sample_barcodes, columns = ['Sample_ID','Tumor','Patient_ID']) # Create the final dataframe expr_interest_df = expr_interest_df1.join(expr_interest_df2) # Fill the previously created dataframe with the correct gene expression values, for each gene of interest and for each TCGA aliquot for gene_sym, dict_value in dict_expr_interest.items(): for tcga_aliq, exp_list in dict_value.items(): if (len(exp_list) != 0): fpkm = exp_list[0] # add the expression value in the proper cell of the dataframe, rounding it to a float with maximum 6 decimal numbers expr_interest_df.set_value(tcga_aliq,gene_sym,round(fpkm,6)) # Add to the dataframe the name of each sample, the tumor code and the patient's ID in correspondence of each 
TCGA aliquot for index, row in expr_df_meta.iterrows(): aliquot = row['biospecimen__bio__bcr_sample_barcode'] tumor_tag = row['clinical__admin__disease_code'] patient_id = row['clinical__shared__patient_id'] expr_interest_df.set_value(aliquot,'Sample_ID',index) expr_interest_df.set_value(aliquot,'Tumor',tumor_tag) expr_interest_df.set_value(aliquot,'Patient_ID',patient_id) # Add a row at the beginning of the dataframe to insert also the Entrez Gene ID of each gene of interest additional_index = ['ENTREZ_GENE_ID'] expr_interest_df0_1 = pd.DataFrame(index = additional_index, columns = [genesSYM_of_interest]) expr_interest_df0_2 = pd.DataFrame(index = additional_index, columns = ['Sample_ID','Tumor','Patient_ID']) expr_interest_df0 = expr_interest_df0_1.join(expr_interest_df0_2) frames = [expr_interest_df0, expr_interest_df] expr_interest_df = pd.concat(frames) # Add for each Gene Symbol of our genes of interest the corresponding Entrez Gene ID in the first row of the dataframe for i, r in EntrezConversion_df.iterrows(): entrez_id = r['ENTREZ_GENE_ID'] gene_name = r['GENE_SYMBOL'] expr_interest_df.set_value('ENTREZ_GENE_ID',gene_name,entrez_id) # Set empty strings for NaN values in the 'GENE_SYMBOL' row expr_interest_df.set_value('ENTREZ_GENE_ID','Sample_ID',"") expr_interest_df.set_value('ENTREZ_GENE_ID','Tumor',"") expr_interest_df.set_value('ENTREZ_GENE_ID','Patient_ID',"") # Export the dataframe with the gene expression values for our genes of interest for each TCGA aliquot writer = ExcelWriter('./3_TCGA_Data/Gene_Expression/Gene Expression - InterestGenes.xlsx') expr_interest_df.to_excel(writer,'Sheet1') writer.save() # Gene expression values for each candidate regulatory gene of the genes of interest: # Create a dictionary for storing all the gene expression values for each gene of interest and for each aliquot TCGA from collections import defaultdict dict_expr_regulatory = defaultdict(dict) for key, value in dict_expr_regulatory.items(): value = defaultdict(list) # The main dictionary has the Gene Symbols of the candidate regulatory genes as keys and each gene has another dictionary as value, which, in turn, has the different aliquots as keys and lists as values. # The idea is having a list, containing all the fpkm values, for each gene in each TCGA aliquot. 
# Set the Gene Symbols as keys of the main dictionary for name in regulatory_genesSYM: dict_expr_regulatory[name] = {} # Set the names of the samples barcodes as keys for each dictionary set as value of a specific key (genes) for sample in expr_sample_barcodes: for k, v in dict_expr_regulatory.items(): v[sample] = [] # Set the values by appending the expression values for each candidate regulatory gene: these expression values (fpkm) can be found in the "expr_df_regs_regulatory" dataframe for index, row in expr_df_regs_regulatory.iterrows(): # iterating along the whole dataframe sym = row['gene_symbol'] # get the Gene Symbol of the gene ens_id = row['ensembl_gene_id'] # get the Ensembl Gene ID fpkm = row['fpkm'] # get the gene expression value sample = row['sample_id'] # get the name of the sample # get the aliquot corresponding to current sample aliq = expr_df_meta.get_value(sample, 'biospecimen__bio__bcr_sample_barcode') # add the value according to the correct gene ID and TCGA aliquot, rounding it to a float with maximum 6 decimal numbers if (gencode_version == 22): if (ens_id not in ['ENSG00000277726.3','ENSG00000275895.3','ENSGR0000214717.8']): dict_expr_regulatory[sym][aliq].append(round(float(fpkm),6)) else: dict_expr_regulatory[sym][aliq].append(round(float(fpkm),6)) # Convert the nested dictionary also into a dataframe # Create a dataframe whose row indexes are the different TCGA samples and the columns are the distinct candidate regulatory genes expr_regulatory_df1 = pd.DataFrame(index = expr_sample_barcodes, columns = [regulatory_genesSYM]) # Add three additional columns for the name of the sample and the ID and barcode of the patient corresponding to each aliquot, in order to have them available if we will need it expr_regulatory_df2 = pd.DataFrame(index = expr_sample_barcodes, columns = ['Sample_ID','Tumor','Patient_ID']) # Create the final dataframe expr_regulatory_df = expr_regulatory_df1.join(expr_regulatory_df2) # Fill the previously created dataframe with the correct gene expression values, for each candidate regulatory gene and for each TCGA aliquot for gene_sym, dict_value in dict_expr_regulatory.items(): for tcga_aliq, exp_list in dict_value.items(): if (len(exp_list) != 0): fpkm = exp_list[0] # add the expression value in the proper cell of the dataframe, rounding it to a float with maximum 6 decimal numbers expr_regulatory_df.set_value(tcga_aliq,gene_sym,round(fpkm,6)) # Add to the dataframe the name of each sample, the tumor code and the patient's ID in correspondence of each TCGA aliquot for index, row in expr_df_meta.iterrows(): aliquot = row['biospecimen__bio__bcr_sample_barcode'] tumor_tag = row['clinical__admin__disease_code'] patient_id = row['clinical__shared__patient_id'] expr_regulatory_df.set_value(aliquot,'Sample_ID',index) expr_regulatory_df.set_value(aliquot,'Tumor',tumor_tag) expr_regulatory_df.set_value(aliquot,'Patient_ID',patient_id) # Add a row at the beginning of the dataframe to insert also the Gene Symbols of each gene of interest additional_index = ['ENTREZ_GENE_ID'] expr_regulatory_df0_1 = pd.DataFrame(index = additional_index, columns = [regulatory_genesSYM]) expr_regulatory_df0_2 = pd.DataFrame(index = additional_index, columns = ['Sample_ID','Tumor','Patient_ID']) expr_regulatory_df0 = expr_regulatory_df0_1.join(expr_regulatory_df0_2) frames = [expr_regulatory_df0, expr_regulatory_df] expr_regulatory_df = pd.concat(frames) # Add for each Gene Symbol of the regulatory genes the corresponding Entrez Gene ID in the first row of the dataframe 
for i in regulatory_genesSYM: if i == 'PTRF': entrez_id = Mapping_df.loc[Mapping_df['GENE_SYMBOL'] == 'CAVIN1', 'ENTREZ_GENE_ID'].iloc[0] else: entrez_id = Mapping_df.loc[Mapping_df['GENE_SYMBOL'] == i, 'ENTREZ_GENE_ID'].iloc[0] expr_regulatory_df.set_value('ENTREZ_GENE_ID',i,entrez_id) # Set empty strings for NaN values in the 'GENE_SYMBOL' row expr_regulatory_df.set_value('ENTREZ_GENE_ID','Sample_ID',"") expr_regulatory_df.set_value('ENTREZ_GENE_ID','Tumor',"") expr_regulatory_df.set_value('ENTREZ_GENE_ID','Patient_ID',"") # Export the dataframe with the gene expression values for the regulatory genes of our genes of interest for each TCGA aliquot writer = ExcelWriter('./3_TCGA_Data/Gene_Expression/Gene Expression - RegulatoryGenes.xlsx') expr_regulatory_df.to_excel(writer,'Sheet1') writer.save() return expr_interest_df, expr_regulatory_df
import collections
import os
import pickle
import re
import string
import sys

import numpy as np
from nltk import conlltags2tree, pos_tag, tree2conlltags, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

ner_tags = collections.Counter()

basepath = os.path.dirname(__file__)
corpus_root = os.path.abspath(os.path.join(basepath, "gmb-2.2.0"))
# corpus_root = "gmb-2.2.0.zip"

# reload(sys)
# sys.setdefaultencoding('utf-8')


def read_gmb(corpus_root):
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    # file_handle = zipfile.ZipFile('gmb-2.2.0.zip', 'r')
import collections
import math


def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=False):
    """Computes BLEU score of translated segments against one or more references.

    Args:
      reference_corpus: list of lists of references for each translation. Each
          reference should be tokenized into a list of tokens.
      translation_corpus: list of translations to score. Each translation
          should be tokenized into a list of tokens.
      max_order: Maximum n-gram order to use when computing BLEU score.
      smooth: Whether or not to apply Lin et al. 2004 smoothing.

    Returns:
      6-tuple with the BLEU score, the n-gram precisions, the brevity penalty,
      the length ratio, the translation length and the reference length.
    """
    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    translation_length = 0
    for (references, translation) in zip(reference_corpus, translation_corpus):
        reference_length += min(len(r) for r in references)
        translation_length += len(translation)

        merged_ref_ngram_counts = collections.Counter()
        for reference in references:
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
        translation_ngram_counts = _get_ngrams(translation, max_order)
        overlap = translation_ngram_counts & merged_ref_ngram_counts
        for ngram in overlap:
            matches_by_order[len(ngram) - 1] += overlap[ngram]
        for order in range(1, max_order + 1):
            possible_matches = len(translation) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order - 1] += possible_matches

    precisions = [0] * max_order
    for i in range(0, max_order):
        if smooth:
            precisions[i] = ((matches_by_order[i] + 1.) /
                             (possible_matches_by_order[i] + 1.))
        else:
            if possible_matches_by_order[i] > 0:
                precisions[i] = (float(matches_by_order[i]) /
                                 possible_matches_by_order[i])
            else:
                precisions[i] = 0.0

    if min(precisions) > 0:
        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0

    ratio = float(translation_length) / reference_length
    if ratio > 1.0:
        bp = 1.
    else:
        if ratio > 1E-1:
            bp = math.exp(1 - 1. / ratio)
        else:
            bp = 1E-2

    bleu = geo_mean * bp
    return (bleu, precisions, bp, ratio, translation_length, reference_length)
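# compute_bleu relies on a _get_ngrams helper that is not shown above. The
# version below is a typical companion implementation (counting all n-grams up
# to max_order with collections.Counter, imported above); it is an assumed
# sketch, not necessarily the original helper.
def _get_ngrams(segment, max_order):
    """Return a Counter of all n-grams (as tuples) up to max_order in segment."""
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(0, len(segment) - order + 1):
            ngram_counts[tuple(segment[i:i + order])] += 1
    return ngram_counts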
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return text.vocab.Vocabulary(counter, min_freq=5)
len_matter_within_trainig_data_list.append(len(elements_dict))
del vector_element

tc_list = tc_list[index_to_use]
pn_list = pn_list[index_to_use]
super_conductor_family_index = super_conductor_family_index[index_to_use]
exit()
# organic_index=organic_index[index_to_use]

if len(tc_list) != len(super_conductor_family_index):
    print('the length of two lists is not equal.')
    exit()
print('the length of the list is ', len(tc_list))

os.chdir(save_dir)
print('problemetic element', np.unique(problematic_elements))
print('frequency', collections.Counter(problematic_elements))
np.save('./made_data/super_conductor_family_index', super_conductor_family_index)
print('super_conductor_family_index.shape', super_conductor_family_index.shape)
# np.save('./made_data/organic_index', organic_index)
print('the length of non_organic is', count_non_organic)

# problematic_materials_list = pd.DataFrame(
#     {'material': problematic_materials, 'element': problematic_elements, 'Tc': problematic_tc})
# filepath = os.path.join(save_dir,'./made_data' ,'problematic_elements.csv')
# problematic_materials_list.to_csv(filepath,index=False)

matter_to_use_list = pd.DataFrame({
    'element': matter_name_to_use,
    'Tc': matter_tc_to_use
def __init__(self, time_frame=1):
    self.time_frame = time_frame
    self.frame_q = collections.deque()
    self.id_counts = collections.Counter()
def __init__(self):
    self.observed_idcounts = collections.Counter()
    self.observed_system_entropy = 0
def sanitize_var_map(op_arguments, arguments, precision=None, device=None):
    '''
    Sanitizes a dictionary of `Variable` s to input data such that it can be
    handed off to the evaluation methods
    (:meth:`~cntk.ops.functions.Function.forward`,
    :meth:`~cntk.ops.functions.Function.backward`,
    :meth:`~cntk.Trainer.train_minibatch` and :meth:`~cntk.Trainer.test_minibatch`).

    Args:
        op_arguments (:class:`~cntk.ops.functions.Function`): arguments of the root function.
         In :meth:`~cntk.ops.functions.Function.forward` pass it is typically `op.arguments`,
         in :meth:`~cntk.ops.functions.Function.backward` pass it is `op.outputs`
        arguments: maps variables to their input data. The interpretation depends on the
         input type:

           * dict: keys are input variable or names, and values are the input data.
           * any other type: if node has a unique input, arguments is mapped to this input.
             For nodes with more than one input, only dict is allowed.

         In both cases, every sample in the data will be interpreted as a new sequence.
         Sequences can be marked as continuations of the same sequence in the previous
         minibatch (that is the sequence in the same slot). There are two possibilities
         for this:

           * specifying arguments as a `tuple` where the first element is used as arguments
             and the second one will be used as a list of bools, denoting whether a sequence
             is a new one (`True`) or a continuation of the sequence in the same slot of the
             previous minibatch (`False`). This will be applied to all batches.
           * specifying arguments as a dictionary of variables to tuples where the first
             element is used as arguments and the second one will be used as a list of bools,
             denoting whether a sequence is a new one (`True`) or a continuation of the
             sequence in the same slot of the previous minibatch (`False`). This will be
             applied to all batches.

         Data should be either NumPy arrays or a :class:`~cntk.io.MinibatchData` instance.
        precision (str or `np.float32` or `np.float64`): if string it can be one of
         'float', 'float32', 'double', 'float64', or None
        device (:class:`~cntk.device.DeviceDescriptor`, default None): device this value
         should be put on

    Returns:
        `dict` that maps variables to sanitized batches
    '''
    from ..io import MinibatchData

    if isinstance(arguments, tuple):
        arguments, seq_starts = arguments
    else:
        seq_starts = None

    if arguments is None or isinstance(arguments, (dict, list)) and len(arguments) == 0:
        if len(op_arguments) > 0:
            raise ValueError('function expects %i arguments' % len(op_arguments))
        return {}

    if len(arguments) < len(op_arguments):
        raise ValueError('your graph has %i inputs, but you specified %i' %
                         (len(op_arguments), len(arguments)))

    if isinstance(arguments, dict):
        arg_names = [var.name for var in op_arguments]
        name_counter = collections.Counter(arg_names)
        var_name_map = dict((var.name, var) for var in op_arguments)
    else:
        if len(op_arguments) == 1:
            name_counter = collections.Counter([op_arguments[0].name])
            var_name_map = dict([(op_arguments[0].name, op_arguments[0])])
            arguments = dict([(op_arguments[0], arguments)])
        else:
            raise ValueError(
                'non-dict argument (%s) is not supported for nodes with more than one input'
                % type(arguments).__name__)

    if precision is not None:
        precision = sanitize_precision(precision)

    var_map = {}
    for var, batch in arguments.items():
        if isinstance(var, str):
            if name_counter[var] == 0:
                raise ValueError(
                    'variable with name "%s" does not exist in the network. '
                    'Available variable names: %s' % (var, ", ".join(var_name_map)))
            elif name_counter[var] > 1:
                raise ValueError('node name "%s" is not unique' % var)

            try:
                var = var_name_map[var]
            except KeyError:
                raise KeyError(
                    "no input with the name '%s' was found. Available: %s"
                    % (var, ", ".join(var_name_map.keys())))

        if isinstance(batch, tuple):
            if seq_starts is not None:
                raise ValueError(
                    'you cannot provide sequence start '
                    'information globally and for individual batches '
                    'at the same time')
            batch, seq_starts = batch

            if seq_starts is not None:
                if not isinstance(seq_starts, (tuple, list)):
                    raise ValueError(
                        'if you specify sequence begin markers, it needs to be a list')

                sample_size = batch.shape[0] if hasattr(batch, 'shape') else len(batch)
                if len(seq_starts) != sample_size:
                    raise ValueError('you have %i sequences, but only %i '
                                     'sequence begin markers' %
                                     (sample_size, len(seq_starts)))

        if isinstance(batch, MinibatchData):
            batch = batch.m_data
        elif not isinstance(batch, cntk_py.Value):
            batch = sanitize_batch(var, batch, seq_starts, device)

        var_map[var] = batch

    return var_map
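# Illustrative call shapes implied by the docstring above (hypothetical names:
# x and y are input Variables, batch_x and batch_y are NumPy arrays); this is a
# sketch of the accepted argument forms, not code from the library itself.
#
# 1) Plain mapping -- every sample starts a new sequence:
#    sanitize_var_map(op.arguments, {x: batch_x, y: batch_y})
#
# 2) Global sequence-start flags applied to all inputs:
#    sanitize_var_map(op.arguments, ({x: batch_x, y: batch_y}, [True, False, True]))
#
# 3) Per-variable sequence-start flags:
#    sanitize_var_map(op.arguments, {x: (batch_x, [True, False]), y: batch_y})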
def mode(data):
    """Return the most common data item. If there are ties, return any one of them."""
    [(item, count)] = collections.Counter(data).most_common(1)
    return item
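# Worked example of the behavior described in the docstring (assumed usage):
# mode('abracadabra') returns 'a' (5 occurrences); for a tie such as
# mode([1, 1, 2, 2]), either 1 or 2 may be returned.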
def reset_get_unique_name():
    """Reset the heaps that store previously-assigned names."""
    global name_heap, prefix_counts
    name_heap = set([None])
    prefix_counts = collections.Counter()
import itertools
import collections

N = int(input())
A = list(map(int, input().split()))
acum = list(itertools.accumulate(A, initial=0))
c = collections.Counter(acum)
print(sum(n * (n - 1) // 2 for n in c.values()))  # nC2
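# Reasoning behind the nC2 count above: equal prefix sums acum[i] == acum[j]
# with i < j mean the slice A[i:j] sums to 0, so each group of n equal prefix
# sums contributes n * (n - 1) // 2 zero-sum subarrays.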
import sys
import json
import re
import collections
import os
import datetime

total_playlists = 0
total_tracks = 0
tracks = set()
artists = set()
albums = set()
titles = set()
total_descriptions = 0
ntitles = set()
title_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()
last_modified_histogram = collections.Counter()
num_edits_histogram = collections.Counter()
playlist_length_histogram = collections.Counter()
num_followers_histogram = collections.Counter()

quick = False
max_files_for_quick_processing = 5


def process_mpd(path):
    count = 0
    filenames = os.listdir(path)
    for filename in sorted(filenames):
occupationList.append(occupationList1[0])

# Output all fetched occupations in the output file
dd = df.iloc[0:, 0:3]
dd['Occupation'] = pd.Series(occupationList, index=df.index)
writer = pd.ExcelWriter('output.xlsx')
dd.to_excel(writer)
writer.save()
print("Final List")
print(occupationList)

# Print count of occupations
counts = collections.Counter(occupationList)
print(counts)

from pytagcloud import create_tag_image, make_tags
from pytagcloud.lang.counter import get_tag_counts

j = ""
for i in occupationList:
    if (i != 'no_occupation'):
        j = j + " " + i
print(j)

# Let's plot the result
import wordcloud
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
#!/usr/bin/env python
import collections

with open('input/day_3') as file:
    data = file.read()

houses = collections.Counter()
position_x = [0, 0]
position_y = [0, 0]
houses[(0, 0)] = 2
santa = 0

for direction in data.strip():
    if direction == '^':
        position_y[santa] += 1
    elif direction == 'v':
        position_y[santa] -= 1
    elif direction == '>':
        position_x[santa] += 1
    elif direction == '<':
        position_x[santa] -= 1
    else:
        print('unknown character "{0}"'.format(direction))
    houses[(position_x[santa], position_y[santa])] += 1
    if santa == 1:
        santa = 0
    else:
        santa = 1

print(len(houses.keys()))
import collections


def f(n, result, s, str):
    if len(str) == n:
        result.add(str)
        return
    for c in s:
        if s[c] == 0:
            continue
        str += c
        s[c] -= 1
        f(n, result, s, str)
        str = str[:-1]
        s[c] += 1
    return


s = input()
n = len(s)
s = collections.Counter(s)
result = set()
f(n, result, s, '')
result = sorted(result)
print(len(result))
for str in result:
    print(str)
def __init__(self):
    super().__init__()
    self.statistics = collections.Counter()
    self.hash_tags = collections.Counter()
    print("console initialized")
def __init__(self):
    self.counts = collections.Counter()
    self.means = collections.defaultdict(float)
    self.vars = collections.defaultdict(float)
def create_accumulator(self):
    return {'total_count': 0, 'lang_count': collections.Counter()}
# You're using Python Standard Library?
import collections
# But that's cheating!!
import re

# Little classes are like snowflakes, there are no two alike.
Phrase = type(
    "", (str, ), {
        "word_count":
        lambda self: collections.Counter(re.findall("\w+", self.lower()))
    })
with open(os.path.join(outputFile5, mot[0] + ".json"), "w") as file:
    json.dump(index, file, indent=4)

total = 0
for folder, subs, files in os.walk(inputFile):
    for mailFile in files:
        total += 1
        print("\rNo crash! mail: " + str(total), end="")
        try:
            cibleMail = os.path.join(folder, mailFile)
            mail = open(cibleMail).read()
            b = email.message_from_string(mail).get_payload()
            msg = email.message_from_file(open(cibleMail))
            if msg['Subject']:
                Subject = msg['Subject']
                b = Subject + " " + b
            arr = re.split("\W+", b)
            arr = [mot.lower() for mot in arr]
            counter = collections.Counter(arr)
            for mot, nb in counter.items():
                if 0 < len(mot) < 4:
                    index_less_than_four_letters_insert(mot, nb, cibleMail)
                elif len(mot) > 5:
                    index_more_than_five_letters_insert(mot, nb, cibleMail)
                elif 3 < len(mot) < 6:
                    index_four_or_five_letters_insert(mot, nb, cibleMail)
        except IOError:
            print(IOError)

print("\n")
def has_anagrams(phrase):
    counters = [collections.Counter(word) for word in phrase]
    for i, word_count in enumerate(counters):
        if word_count in counters[i+1:]:
            return True
    return False
def most_common_tags(self):
    print(f"{gr}[+]{nu} user uploads tags : \n")
    for key, value in collections.Counter(self.tags).most_common():
        print(f"{gr}{key} : {nu}{value}")
def main(params): """ :param params: :return: coco_pred_sg: {'rela_matrix': array([[0., 2., 425.]), 'obj_attr': array([[0., 36., 313.])} coco_spice_sg: {'rela_info': array([[5.0000e+00, 0.0000e+00, 1.0414e+04]), 'obj_info': [[179], [833], [1018], [1092], [3788], [5989, 6623, 4081], [7128], [7372, 5989]]} """ imgs = json.load(open(params['input_json'], 'r'))['images'] # category to supercategory cat2supercat = {} text = open(params['category_txt'], 'r').readlines() for line in text: line = line.strip().split(',') assert len(line) >= 1 for i in range(len(line)): cat2supercat[line[i]] = line[0] # detection categories det_dict = np.load(params['img_sg_dict'], allow_pickle=True)[()]['i2w'] sent_dict = np.load(params['sent_sg_dict'], allow_pickle=True)['spice_dict'][()]['ix_to_word'] # sentence scene graph and pre-trained image scene graph sg_img_dir = 'data/coco_img_sg' sg_snt_dir = 'data/coco_spice_sg2' # load predicates vocab if not None if os.path.isfile(params['pred_category']): print ('--loading predicates--') predicates = {key: i for i, (key, _) in enumerate(collections.Counter(json.load( open(params['pred_category'], 'r'))).most_common(params['top_predicates']))} else: predicates = None aligned_triplets = {} pred_candidate = [] if predicates is None: print('----------------collecting predicates---------------------') else: print('----------------collecting alignments---------------------') for img in tqdm(imgs): split = img['split'] if split not in ['train', 'restval'] and predicates is None: continue name = str(img['id']) + '.npy' sg_img_file = os.path.join(sg_img_dir, name) sg_snt_file = os.path.join(sg_snt_dir, name) sg_img_use = np.load(sg_img_file, encoding='latin1', allow_pickle=True)[()] sg_snt_use = np.load(sg_snt_file, encoding='latin1', allow_pickle=True)[()] sg_snt_rela = sg_snt_use['rela_info'].astype(int) sg_snt_obj = sg_snt_use['obj_info'] sg_img_obj_set = set( [det_dict[ele] for ele in set(sg_img_use['obj_attr'].astype(int)[:, 1].reshape(-1).tolist())]) sg_img_obj = [det_dict[ele] for ele in sg_img_use['obj_attr'].astype(int)[:, 1].reshape(-1).tolist()] tmp = dict() for snt_rela in sg_snt_rela: sub_ix, obj_ix, pred_ix = snt_rela sub, obj = sg_snt_obj[sub_ix], sg_snt_obj[obj_ix] sub, obj, pred = decode(sub, obj, pred_ix, sent_dict) if predicates is not None and pred in predicates: all_pairs = get_index(sub, obj, sg_img_obj, cat2supercat) if len(all_pairs) != 0: tmp[predicates[pred]] = all_pairs if predicates is None and check(sub, obj, sg_img_obj_set, cat2supercat): pred_candidate.append(pred) if predicates is not None: aligned_triplets[name.split('.')[0]] = tmp if predicates is None: cnt_pred = collections.Counter(pred_candidate) json.dump(dict(cnt_pred), open(params['pred_category'], 'w')) else: json.dump(aligned_triplets, open(params['aligned_triplets'], 'w'), indent=4, sort_keys=True, default=str) get_dict_file(params) return
def part_one():
    counted_layers = [collections.Counter(layer) for layer in layers]
    sorted_layers = sorted(counted_layers, key=lambda x: x['0'])
    return sorted_layers[0]['1'] * sorted_layers[0]['2']
    for function_name in func_names:
        verbs.append(get_verbs_from_function_name(function_name))
    return make_flat_list(verbs)


def get_verbs_from_function_name(function_name):
    return [word for word in function_name.split('_') if is_verb(word)]


if __name__ == "__main__":
    wds = []
    projects = [
        'projects/django',
    ]
    '''
        'flask',
        'pyramid',
        'reddit',
        'requests',
        'sqlalchemy',
    ]
    '''
    for project in projects:
        path = os.path.join('.', project)
        get_top_verbs_in_path(path)
        wds += get_top_verbs_in_path(path)

    top_size = 200
    print('total {} words, {} unique'.format(len(wds), len(set(wds))))
    for word, occurence in collections.Counter(wds).most_common(top_size):
        print(word, occurence)
import collections


def insert_card(card_number, card_list):
    i = 0
    while i < card_number:
        card_list.append(input())
        i += 1


blue_card_number = int(input())
blue_card_list = []
insert_card(blue_card_number, blue_card_list)

red_card_number = int(input())
red_card_list = []
insert_card(red_card_number, red_card_list)

blue_card_collection = collections.Counter(blue_card_list)
red_card_collection = collections.Counter(red_card_list)
result_card_collection = blue_card_collection - red_card_collection
answer_collection = sorted(result_card_collection.items(), key=lambda x: x[1], reverse=True)

if answer_collection == []:
    print(0)
else:
    print(answer_collection[0][1])