Example #1
0
def flatten(nested_list):
    """
    Return a flattened list of items from a nested list.
    """
    lst = []
    for item in nested_list:
        if isinstance(item, (list, tuple)):
            lst.extend(flatten(item))
        else:
            lst.append(item)
    return lst
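# For example, flatten([1, [2, (3, [4])], 5]) returns [1, 2, 3, 4, 5].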


# Store names that have been previously assigned.
name_heap = set([None])
prefix_counts = collections.Counter()


def reset_get_unique_name():
    """Reset the heaps that store previously-assigned names."""
    global name_heap, prefix_counts
    name_heap = set([None])
    prefix_counts = collections.Counter()


def get_unique_name(lst, attrib, prefix, initial=None):
    """
    Return a name that doesn't collide with another in a list.

    This subroutine is used to generate unique part references (e.g., "R12")
    or unique net names (e.g., "N$5").
    """
Example #2
0
import collections
print(collections.Counter(''.join(input().split())).most_common(1)[0][1])
Example #3
0
def solution(participant, completion):
    answer = collections.Counter(participant) - collections.Counter(completion)
    return list(answer.keys())[0]
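# For example, solution(["leo", "kiki", "eden"], ["eden", "kiki"]) returns "leo"
# (the participant who does not appear in the completion list).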
Example #4
0

# Assumption: the fewer people there were in your family, the faster you could get to a boat; the more family members, the more coordination was required. However, if you had no family members, you might have chosen to help others and sacrifice yourself.
#
# * Females traveling with up to 2 additional family members had a higher chance of survival. However, the survival rate varies strongly once family size exceeds 4, as mothers/daughters would search longer for their relatives, decreasing their own chances of survival.
# * Men traveling alone may have sacrificed themselves to help other people survive.

# ## 1.5 Survival rate by title
# * Barplots show that royalty normally held 1st or 2nd class tickets, whereas people with the title Master mostly held 3rd class tickets. In fact, the title 'Master' was given to unmarried boys; the age of people with this title is less than 13.
# * Women and royalty had a higher survival rate. (There are only two titled women in the training set and both survived; I would put them into the Mrs class.)
# * Civil servants and reverends had a lower one, since they had to (or chose to) help other people.

# In[ ]:

train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
print(collections.Counter(train['Title']).most_common())
test['Title'] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
print()
print(collections.Counter(test['Title']).most_common())

# In[ ]:

tab = pd.crosstab(train['Title'], train['Pclass'])
print(tab)
tab_prop = tab.div(tab.sum(1).astype(float), axis=0)
tab_prop.plot(kind="bar", stacked=True)

# Investigate who the masters were. Their age is less than 12.

# In[ ]:
Example #5
0
def extract_expression(tumor, platform, gencode_version):

	"""
	The EXTRACT_EXPRESSION operation extracts expression values from TCGA for all the genes of interest and their candidate regulatory genes. Intermediate results files are exported locally during the execution of the function, while the final dataframes are returned as Pandas dataframes and exported locally in the Excel files 'Gene Expression - InterestGenes.xlsx' and 'Gene Expression - RegulatoryGenes.xlsx'.

	:param tumor: full name of the tumor of interest, encoded as a string (e.g. 'Ovarian Serous Cystadenocarcinoma', 'Breast Invasive Carcinoma', ...)
	:param platform: number identifying the sequencing platform (either 27 for the 27k probes sequencing platform or 450 for the 450k probes sequencing platform)
	:param gencode_version: number representing the GENCODE genomic annotations to use (currently, for assembly GRCh38, versions 22, 24 and 27 can be used)
	:return: two Pandas dataframes

	Example::
	
		import genereg as gr
		expr_interest_df, expr_regul_df = gr.GeneExpression.extract_expression(tumor='Ovarian Serous Cystadenocarcinoma', platform=27, gencode_version=22)
	"""

	# Check input parameters
	tcga_tumors = ["Acute Myeloid Leukemia","Adrenocortical Carcinoma","Bladder Urothelial Carcinoma","Brain Lower Grade Glioma" ,"Breast Invasive Carcinoma","Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma","Cholangiocarcinoma","Colon Adenocarcinoma","Esophageal Carcinoma","Glioblastoma Multiforme","Head and Neck Squamous Cell Carcinoma","Kidney Chromophobe","Kidney Renal Clear Cell Carcinoma","Kidney Renal Papillary Cell Carcinoma","Liver Hepatocellular Carcinoma","Lung Adenocarcinoma","Lung Squamous Cell Carcinoma","Lymphoid Neoplasm Diffuse Large B-cell Lymphoma","Mesothelioma","Ovarian Serous Cystadenocarcinoma","Pancreatic Adenocarcinoma","Pheochromocytoma and Paraganglioma","Prostate Adenocarcinoma","Rectum Adenocarcinoma","Sarcoma","Skin Cutaneous Melanoma","Stomach Adenocarcinoma","Testicular Germ Cell Tumors","Thymoma","Thyroid Carcinoma","Uterine Carcinosarcoma","Uterine Corpus Endometrial Carcinoma","Uveal Melanoma"]
	if tumor not in tcga_tumors:
		raise ValueError('PATHOLOGY NOT SUPPORTED! You can analyze one of these 33 types of TCGA tumors: '+(', '.join(tcga_tumors)))
	
	if platform not in [27, 450]:
		raise ValueError('PLATFORM NOT RECOGNIZED! Sequencing platforms available: 27 and 450')
	
	if gencode_version not in [22, 24, 27]:
		raise ValueError('GRCh38 GENCODE versions available are 22, 24 and 27')
	
	
	# Load the list of genes of interest
	EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',sheetname='Sheet1',header=0,converters={'GENE_SYMBOL':str,'ENTREZ_GENE_ID':str,'GENE_SET':str})
	
	# Create a list containing the Gene Symbols of the genes of interest
	genesSYM_of_interest = []
	for i, r in EntrezConversion_df.iterrows():
		sym = r['GENE_SYMBOL']
		if sym not in genesSYM_of_interest:
			genesSYM_of_interest.append(sym)

	# Import the dictionary of genes of interest with their candidate regulatory genes
	dict_RegulGenes = pickle.load(open('./2_Regulatory_Genes/dict_RegulGenes.p', 'rb'))

	# Import the gene-TFs mapping dataframe 
	Mapping_df = pd.read_excel('./0_Genes_Mapping/Genes Mapping.xlsx',sheetname='Sheet1',header=0,converters={'ENTREZ_GENE_ID':str,'HGNC_ID':str})

	# Create a list containing the Gene Symbols of the regulatory genes of genes of interest
	regulatory_genesSYM = []
	for key, value in dict_RegulGenes.items():
		for gene in value:  
			if gene not in regulatory_genesSYM:
				regulatory_genesSYM.append(gene)

	# Extract the list of distinct Gene Symbols mapped in the mapping table
	mapped_gene_SYMs = []
	for index, row in Mapping_df.iterrows():
		sym = row['GENE_SYMBOL']
		if sym not in mapped_gene_SYMs:
			mapped_gene_SYMs.append(sym)


	# Execute the query for the extraction of gene expression values on the remote server, using the PyGMQL Python library
	gl.set_remote_address('http://gmql.eu/gmql-rest/')
	gl.login()
	gl.set_mode('remote')

	# Load the TCGA datasets to be used in the query
	methylation_dataset = gl.load_from_remote(remote_name='GRCh38_TCGA_methylation', owner='public')  
	expression_dataset = gl.load_from_remote(remote_name='GRCh38_TCGA_gene_expression', owner='public') 

	# Identify the sequencing platform to be used
	if platform == 27:
		seq_platform = 'Illumina Human Methylation 27'
	elif platform == 450:
		seq_platform = 'Illumina Human Methylation 450'
	
	# Extract all the samples for the current tumor and platform
	all_methyl = methylation_dataset.meta_select((methylation_dataset['manually_curated__cases__disease_type'] == tumor) & (methylation_dataset['manually_curated__platform'] == seq_platform) & ((methylation_dataset['biospecimen__bio__sample_type'] == 'Primary Tumor') | (methylation_dataset['biospecimen__bio__sample_type'] == 'Recurrent Tumor')) & (methylation_dataset['clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))
	all_expr = expression_dataset.meta_select((expression_dataset['manually_curated__cases__disease_type'] == tumor) & ((expression_dataset['biospecimen__bio__sample_type'] == 'Primary Tumor') | (expression_dataset['biospecimen__bio__sample_type'] == 'Recurrent Tumor')) & (expression_dataset['clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))

	# Gene Expression:
	expr_0 = all_expr.reg_project(field_list=['ensembl_gene_id','entrez_gene_id','gene_symbol','fpkm'])
	expr = expr_0.meta_select(semiJoinDataset=all_methyl, semiJoinMeta=['biospecimen__bio__bcr_sample_barcode'])

	# Materialize the results into a GDataframe
	expr_Gdf = expr.materialize('./(MaterializeResults)')


	# The result dataset is loaded as a GDataframe, an object containing two pandas dataframes, one for the region data and one for the metadata.
	# Get the two pandas dataframes:
	expr_df_regs = expr_Gdf.regs
	expr_df_meta = expr_Gdf.meta
	n_regs = len(expr_df_regs)
	n_samples = len(expr_df_meta)

	# Rename 'chr', 'start', and 'stop' columns header
	expr_df_regs.rename(columns={'chr':'chrom','start':'left','stop':'right'}, inplace=True)
	# Change index into progressive integer numbers and store the name of the sample in another column
	expr_df_regs['sample_id'] = expr_df_regs.index
	expr_df_regs.index = range(n_regs)

	# Convert unknown values (NaN) to empty strings
	expr_df_regs = expr_df_regs.fillna('')

	# Convert all the metadata values into strings, since they're encoded as lists in Python
	col_names = []
	for name, values in expr_df_meta.iteritems():
		col_names.append(name)
	for index, row in expr_df_meta.iterrows():
		for c in col_names:
			list_val = row[c] # it's encoded as a list
			str_val = ''.join(list_val)  # convert the value stored as a list into a string
			expr_df_meta.set_value(index,c,str_val)

		
	# Since we have to extract the expression values for each distinct sample barcode (aliquot), we create a list containing these distinct identifiers
	expr_sample_barcodes_all = []
	for index, row in expr_df_meta.iterrows():
		barcode = row['biospecimen__bio__bcr_sample_barcode']    
		if barcode not in expr_sample_barcodes_all: # get distinct values
			expr_sample_barcodes_all.append(barcode)
        
	# Check which are repeated aliquots, if present
	all_aliquots = []
	for index, row in expr_df_meta.iterrows():
		barcode = row['biospecimen__bio__bcr_sample_barcode']
		all_aliquots.append(barcode)
	multiple_aliquots = [item for item, count in collections.Counter(all_aliquots).items() if count > 1]

	samples_to_remove = []
	expr_sample_barcodes = []
	if len(multiple_aliquots) != 0:    
		# Among the repeated aliquots, keep only the most recent ones (drop the older 2011 shipments)
		for index, row in expr_df_meta.iterrows():
			year = row['biospecimen__bio__year_of_shipment']
			barcode = row['biospecimen__bio__bcr_sample_barcode']  
			if (barcode in multiple_aliquots) and year == '2011':
				expr_df_meta.drop(index, inplace=True)
				samples_to_remove.append(index)

		# Import the list of aliquots in the methylation dataset (the 'common_aliquots' file path is assumed to be defined elsewhere in the module)
		text_file = open(common_aliquots, 'r')
		aliquots = text_file.read().split('\n')
		aliquots.remove('')
		text_file.close()
			
		# Extract the new list of distinct TCGA Aliquots to extract
		for index, row in expr_df_meta.iterrows():
			barcode = row['biospecimen__bio__bcr_sample_barcode'] 
			if barcode in aliquots:
				if barcode not in expr_sample_barcodes:
					expr_sample_barcodes.append(barcode)        
			else:
				expr_df_meta.drop(index, inplace=True)
				samples_to_remove.append(index)
			
		# Remove regions that corresponded to eliminated repeated aliquots
		expr_df_regs = expr_df_regs.loc[~(expr_df_regs['sample_id'].isin(samples_to_remove))].copy()

	else:
		expr_sample_barcodes = expr_sample_barcodes_all		

		
	# Export the metadata dataframe setting the TCGA aliquots as indexes.
	Metadata_df = expr_df_meta.copy()
	Metadata_df['id_sample'] = Metadata_df.index
	Metadata_df.set_index('biospecimen__bio__bcr_sample_barcode', inplace=True)
	writer = ExcelWriter('./3_TCGA_Data/Gene_Expression/EXPR (Metadata).xlsx')
	Metadata_df.to_excel(writer,'Sheet1')
	writer.save()	


	# Extract from the expression dataset all the regions that belong to genes of interest
	expr_df_regs_interest = expr_df_regs.loc[expr_df_regs['gene_symbol'].isin(genesSYM_of_interest)].copy()
	# Extract from the expression dataset all the regions that belong to regulatory genes of genes of interest
	expr_df_regs_regulatory = expr_df_regs.loc[expr_df_regs['gene_symbol'].isin(regulatory_genesSYM)].copy()


	# Gene expression values for each gene of interest:

	# Create a dictionary for storing all the gene expression values for each gene of interest and for each aliquot TCGA
	from collections import defaultdict
	dict_expr_interest = defaultdict(dict)

	for key, value in dict_expr_interest.items():
		value = defaultdict(list)

	# The main dictionary has the Gene Symbols of the genes of interest as keys, and each gene has another dictionary as value, which, in turn, has the different aliquots as keys and lists as values.
	# The idea is to have a list containing all the fpkm values for each gene in each TCGA aliquot.
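	# For illustration (hypothetical gene and aliquot names), the final structure looks like:
	#   dict_expr_interest = {'GENE_A': {'TCGA-XX-XXXX-01A': [3.251402], ...}, ...}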

	# Set the Gene Symbol as keys of the main dictionary
	for name in genesSYM_of_interest:
		dict_expr_interest[name] = {}

	# Set the names of the samples barcodes as keys for each dictionary set as value of a specific key (genes)
	for sample in expr_sample_barcodes:
		for k, v in dict_expr_interest.items():
			v[sample] = []
			
	# Set the values by appending the expression values for each gene of interest: these expression values (fpkm) can be found in the 'expr_df_regs_interest' dataframe
	for index, row in expr_df_regs_interest.iterrows():   # iterating along the whole dataframe
		sym = row['gene_symbol']  # get the Gene Symbol of the gene
		fpkm = row['fpkm']  # get the gene expression value
		sample = row['sample_id']  # get the name of the sample
		# get the aliquot corresponding to current sample
		aliq = expr_df_meta.get_value(sample, 'biospecimen__bio__bcr_sample_barcode')  
		# add the value according to the correct gene ID and TCGA aliquot, rounding it to a float with maximum 6 decimal numbers,
		dict_expr_interest[sym][aliq].append(round(float(fpkm),6))
		

	# Convert the nested dictionary also into a dataframe

	# Create a dataframe whose row indexes are the different TCGA samples and the columns are the distinct genes of interest
	expr_interest_df1 = pd.DataFrame(index = expr_sample_barcodes, columns = [genesSYM_of_interest])

	# Add three additional columns for the name of the sample and the ID and barcode of the patient corresponding to each aliquot, so that they are available if needed later
	expr_interest_df2 = pd.DataFrame(index = expr_sample_barcodes, columns = ['Sample_ID','Tumor','Patient_ID'])

	# Create the final dataframe
	expr_interest_df = expr_interest_df1.join(expr_interest_df2)

	# Fill the previously created dataframe with the correct gene expression values, for each gene of interest and for each TCGA aliquot            
	for gene_sym, dict_value in dict_expr_interest.items():
		for tcga_aliq, exp_list in dict_value.items():
			if (len(exp_list) != 0):
				fpkm = exp_list[0]
				# add the expression value in the proper cell of the dataframe, rounding it to a float with maximum 6 decimal numbers
				expr_interest_df.set_value(tcga_aliq,gene_sym,round(fpkm,6))
				

	# Add to the dataframe the name of each sample, the tumor code and the patient's ID in correspondence of each TCGA aliquot
	for index, row in expr_df_meta.iterrows():
		aliquot = row['biospecimen__bio__bcr_sample_barcode']
		tumor_tag = row['clinical__admin__disease_code']
		patient_id = row['clinical__shared__patient_id']
		expr_interest_df.set_value(aliquot,'Sample_ID',index)
		expr_interest_df.set_value(aliquot,'Tumor',tumor_tag)
		expr_interest_df.set_value(aliquot,'Patient_ID',patient_id)
		
	# Add a row at the beginning of the dataframe to insert also the Entrez Gene ID of each gene of interest
	additional_index = ['ENTREZ_GENE_ID']
	expr_interest_df0_1 = pd.DataFrame(index = additional_index, columns = [genesSYM_of_interest])
	expr_interest_df0_2 = pd.DataFrame(index = additional_index, columns = ['Sample_ID','Tumor','Patient_ID'])
	expr_interest_df0 = expr_interest_df0_1.join(expr_interest_df0_2)

	frames = [expr_interest_df0, expr_interest_df]
	expr_interest_df = pd.concat(frames)

	# Add for each Gene Symbol of our genes of interest the corresponding Entrez Gene ID in the first row of the dataframe
	for i, r in EntrezConversion_df.iterrows():
		entrez_id = r['ENTREZ_GENE_ID']
		gene_name = r['GENE_SYMBOL']
		expr_interest_df.set_value('ENTREZ_GENE_ID',gene_name,entrez_id)

	# Set empty strings for NaN values in the 'GENE_SYMBOL' row
	expr_interest_df.set_value('ENTREZ_GENE_ID','Sample_ID',"")
	expr_interest_df.set_value('ENTREZ_GENE_ID','Tumor',"")
	expr_interest_df.set_value('ENTREZ_GENE_ID','Patient_ID',"")


	# Export the dataframe with the gene expression values for our genes of interest for each TCGA aliquot 
	writer = ExcelWriter('./3_TCGA_Data/Gene_Expression/Gene Expression - InterestGenes.xlsx')
	expr_interest_df.to_excel(writer,'Sheet1')
	writer.save()


	# Gene expression values for each candidate regulatory gene of the genes of interest:

	# Create a dictionary for storing all the gene expression values for each gene of interest and for each aliquot TCGA
	from collections import defaultdict
	dict_expr_regulatory = defaultdict(dict)

	for key, value in dict_expr_regulatory.items():
		value = defaultdict(list)

	# The main dictionary has the Gene Symbols of the candidate regulatory genes as keys, and each gene has another dictionary as value, which, in turn, has the different aliquots as keys and lists as values.
	# The idea is to have a list containing all the fpkm values for each gene in each TCGA aliquot.

	# Set the Gene Symbols as keys of the main dictionary
	for name in regulatory_genesSYM:
		dict_expr_regulatory[name] = {}

	# Set the names of the samples barcodes as keys for each dictionary set as value of a specific key (genes)
	for sample in expr_sample_barcodes:
		for k, v in dict_expr_regulatory.items():
			v[sample] = []
        
	# Set the values by appending the expression values for each candidate regulatory gene: these expression values (fpkm) can be found in the "expr_df_regs_regulatory" dataframe
	for index, row in expr_df_regs_regulatory.iterrows():   # iterating along the whole dataframe
		sym = row['gene_symbol']  # get the Gene Symbol of the gene
		ens_id = row['ensembl_gene_id']  # get the Ensembl Gene ID
		fpkm = row['fpkm']  # get the gene expression value
		sample = row['sample_id']  # get the name of the sample
		# get the aliquot corresponding to current sample
		aliq = expr_df_meta.get_value(sample, 'biospecimen__bio__bcr_sample_barcode')
		# add the value according to the correct gene ID and TCGA aliquot, rounding it to a float with maximum 6 decimal numbers
		if (gencode_version == 22):
			if (ens_id not in ['ENSG00000277726.3','ENSG00000275895.3','ENSGR0000214717.8']):
				dict_expr_regulatory[sym][aliq].append(round(float(fpkm),6))
		else:
			dict_expr_regulatory[sym][aliq].append(round(float(fpkm),6))
	


	# Convert the nested dictionary also into a dataframe

	# Create a dataframe whose row indexes are the different TCGA samples and the columns are the distinct candidate regulatory genes
	expr_regulatory_df1 = pd.DataFrame(index = expr_sample_barcodes, columns = [regulatory_genesSYM])

	# Add three additional columns for the name of the sample and the ID and barcode of the patient corresponding to each aliquot, so that they are available if needed later
	expr_regulatory_df2 = pd.DataFrame(index = expr_sample_barcodes, columns = ['Sample_ID','Tumor','Patient_ID'])

	# Create the final dataframe
	expr_regulatory_df = expr_regulatory_df1.join(expr_regulatory_df2)

	# Fill the previously created dataframe with the correct gene expression values, for each candidate regulatory gene and for each TCGA aliquot            
	for gene_sym, dict_value in dict_expr_regulatory.items():
		for tcga_aliq, exp_list in dict_value.items():
			if (len(exp_list) != 0):
				fpkm = exp_list[0]
				# add the expression value in the proper cell of the dataframe, rounding it to a float with maximum 6 decimal numbers
				expr_regulatory_df.set_value(tcga_aliq,gene_sym,round(fpkm,6))
				

	# Add to the dataframe the name of each sample, the tumor code and the patient's ID in correspondence of each TCGA aliquot
	for index, row in expr_df_meta.iterrows():
		aliquot = row['biospecimen__bio__bcr_sample_barcode']
		tumor_tag = row['clinical__admin__disease_code']
		patient_id = row['clinical__shared__patient_id']
		expr_regulatory_df.set_value(aliquot,'Sample_ID',index)
		expr_regulatory_df.set_value(aliquot,'Tumor',tumor_tag)
		expr_regulatory_df.set_value(aliquot,'Patient_ID',patient_id)
		
	# Add a row at the beginning of the dataframe to insert also the Entrez Gene ID of each candidate regulatory gene
	additional_index = ['ENTREZ_GENE_ID']
	expr_regulatory_df0_1 = pd.DataFrame(index = additional_index, columns = [regulatory_genesSYM])
	expr_regulatory_df0_2 = pd.DataFrame(index = additional_index, columns = ['Sample_ID','Tumor','Patient_ID'])
	expr_regulatory_df0 = expr_regulatory_df0_1.join(expr_regulatory_df0_2)

	frames = [expr_regulatory_df0, expr_regulatory_df]
	expr_regulatory_df = pd.concat(frames)

	# Add for each Gene Symbol of the regulatory genes the corresponding Entrez Gene ID in the first row of the dataframe
	for i in regulatory_genesSYM:
		if i == 'PTRF':
			entrez_id = Mapping_df.loc[Mapping_df['GENE_SYMBOL'] == 'CAVIN1', 'ENTREZ_GENE_ID'].iloc[0]
		else:
			entrez_id = Mapping_df.loc[Mapping_df['GENE_SYMBOL'] == i, 'ENTREZ_GENE_ID'].iloc[0]
		expr_regulatory_df.set_value('ENTREZ_GENE_ID',i,entrez_id)

	# Set empty strings for NaN values in the 'GENE_SYMBOL' row
	expr_regulatory_df.set_value('ENTREZ_GENE_ID','Sample_ID',"")
	expr_regulatory_df.set_value('ENTREZ_GENE_ID','Tumor',"")
	expr_regulatory_df.set_value('ENTREZ_GENE_ID','Patient_ID',"")


	# Export the dataframe with the gene expression values for the regulatory genes of our genes of interest for each TCGA aliquot 
	writer = ExcelWriter('./3_TCGA_Data/Gene_Expression/Gene Expression - RegulatoryGenes.xlsx')
	expr_regulatory_df.to_excel(writer,'Sheet1')
	writer.save()
	
	return expr_interest_df, expr_regulatory_df
Example #6
0
import collections
import os
import pickle
import re
import string
import sys

import numpy as np
from nltk import conlltags2tree, pos_tag, tree2conlltags, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

ner_tags = collections.Counter()

basepath = os.path.dirname(__file__)
corpus_root = os.path.abspath(os.path.join(basepath, "gmb-2.2.0"))

# corpus_root = "gmb-2.2.0.zip"
# reload(sys)
# sys.setdefaultencoding('utf-8')


def read_gmb(corpus_root):
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    # file_handle = zipfile.ZipFile('gmb-2.2.0.zip', 'r')
Example #7
0
def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
  """Computes BLEU score of translated segments against one or more references.
  Args:
    reference_corpus: list of lists of references for each translation. Each
        reference should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
        should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoothing.
  Returns:
    3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
    precisions and brevity penalty.
  """
  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  reference_length = 0
  translation_length = 0
  for (references, translation) in zip(reference_corpus,
                                       translation_corpus):
    reference_length += min(len(r) for r in references)
    translation_length += len(translation)

    merged_ref_ngram_counts = collections.Counter()
    for reference in references:
      merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
    translation_ngram_counts = _get_ngrams(translation, max_order)
    overlap = translation_ngram_counts & merged_ref_ngram_counts
    for ngram in overlap:
      matches_by_order[len(ngram)-1] += overlap[ngram]
    for order in range(1, max_order+1):
      possible_matches = len(translation) - order + 1
      if possible_matches > 0:
        possible_matches_by_order[order-1] += possible_matches

  precisions = [0] * max_order
  for i in range(0, max_order):
    if smooth:
      precisions[i] = ((matches_by_order[i] + 1.) /
                       (possible_matches_by_order[i] + 1.))
    else:
      if possible_matches_by_order[i] > 0:
        precisions[i] = (float(matches_by_order[i]) /
                         possible_matches_by_order[i])
      else:
        precisions[i] = 0.0

  if min(precisions) > 0:
    p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
    geo_mean = math.exp(p_log_sum)
  else:
    geo_mean = 0

  ratio = float(translation_length) / reference_length

  if ratio > 1.0:
    bp = 1.
  else:
    if ratio > 1E-1:
        bp = math.exp(1 - 1. / ratio)
    else:
        bp = 1E-2

  bleu = geo_mean * bp

  return (bleu, precisions, bp, ratio, translation_length, reference_length)
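
# Note: compute_bleu above relies on a _get_ngrams helper (and on the collections and
# math modules) that are not shown in this snippet. The sketch below is an assumption
# about that missing helper, not part of the original code, plus a hedged usage example.
import collections
import math


def _get_ngrams(segment, max_order):
  """Return a Counter mapping each n-gram (as a tuple of tokens) up to max_order to its count."""
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    for i in range(0, len(segment) - order + 1):
      ngram_counts[tuple(segment[i:i + order])] += 1
  return ngram_counts


# With a single, perfectly matching reference, the BLEU score should be 1.0:
# compute_bleu([[["the", "cat", "sat", "on", "the", "mat"]]],
#              [["the", "cat", "sat", "on", "the", "mat"]])[0] == 1.0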
Example #8
0
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return text.vocab.Vocabulary(counter, min_freq=5)
Example #9
0
    len_matter_within_trainig_data_list.append(len(elements_dict))
del vector_element
tc_list = tc_list[index_to_use]
pn_list = pn_list[index_to_use]
super_conductor_family_index = super_conductor_family_index[index_to_use]
exit()
# organic_index=organic_index[index_to_use]

if len(tc_list) != len(super_conductor_family_index):
    print('the length of two lists is not equal.')
    exit()
print('the length of the list is ', len(tc_list))

os.chdir(save_dir)
print('problemetic element', np.unique(problematic_elements))
print('frequency', collections.Counter(problematic_elements))

np.save('./made_data/super_conductor_family_index',
        super_conductor_family_index)
print('super_conductor_family_index.shape', super_conductor_family_index.shape)
# np.save('./made_data/organic_index', organic_index)
print('the length of non_organic is', count_non_organic)

# problematic_materials_list = pd.DataFrame(
#     {'material': problematic_materials, 'element': problematic_elements, 'Tc': problematic_tc})
# filepath = os.path.join(save_dir,'./made_data' ,'problematic_elements.csv')
# problematic_materials_list.to_csv(filepath,index=False)

matter_to_use_list = pd.DataFrame({
    'element': matter_name_to_use,
    'Tc': matter_tc_to_use
Example #10
0
 def __init__(self, time_frame=1):
     self.time_frame = time_frame
     self.frame_q = collections.deque()
     self.id_counts = collections.Counter()
Example #11
0
 def __init__(self):
     self.observed_idcounts = collections.Counter()
     self.observed_system_entropy = 0
Example #12
0
def sanitize_var_map(op_arguments, arguments, precision=None, device=None):
    '''
    Sanitizes a dictionary of `Variable` s to input data such that it can be
    handed off to the evaluation methods
    (:meth:`~cntk.ops.functions.Function.forward`,
    :meth:`~cntk.ops.functions.Function.backward`, :meth:`~cntk.Trainer.train_minibatch` and
    :meth:`~cntk.Trainer.test_minibatch`).

    Args:
        op_arguments (:class:`~cntk.ops.functions.Function`): arguments of the root function. In
         :meth:`~cntk.ops.functions.Function.forward` pass it is typically
         `op.arguments`, in :meth:`~cntk.ops.functions.Function.backward` pass it is
         `op.outputs`
        arguments: maps variables to their input data. The interpretation depends on
         the input type:

           * dict: keys are input variable or names, and values are the input data.
           * any other type: if the node has a unique input, arguments is
             mapped to this input.
         For nodes with more than one input, only dict is allowed.

         In both cases, every sample in the data will be interpreted
         as a new sequence.

         Sequences can be marked as continuations of the same sequence in
         the previous minibatch (that is the sequence in the same slot).
         There are two possibilities for this:

          * specifying arguments as a `tuple` where the first element is
            used as arguments and the second one will be used as a list
            of bools, denoting whether a sequence is a new one (`True`) or a
            continuation of the sequence in the same slot of the previous
            minibatch (`False`). This will be applied to all batches.
          * specifying arguments as a dictionary of variables to tuples
            where the first element is used as arguments and the second
            one will be used as a list of bools, denoting whether a sequence
            is a new one (`True`) or a continuation of the sequence in the
            same slot of the previous minibatch (`False`). This will be
            applied to all batches.

         Data should be either NumPy arrays or a
         :class:`~cntk.io.MinibatchData` instance.
        precision (str or `np.float32` or `np.float64`): if string it can be
         one of 'float', 'float32', 'double', 'float64', or None
        device (:class:`~cntk.device.DeviceDescriptor`, default None): device
         this value should be put on

    Returns:
        `dict` that maps variables to sanitized batches
    '''
    from ..io import MinibatchData

    if isinstance(arguments, tuple):
        arguments, seq_starts = arguments
    else:
        seq_starts = None

    if arguments is None or isinstance(arguments,
                                       (dict, list)) and len(arguments) == 0:
        if len(op_arguments) > 0:
            raise ValueError('function expects %i arguments' %
                             len(op_arguments))
        return {}

    if len(arguments) < len(op_arguments):
        raise ValueError('your graph has %i inputs, but you specified %i' %
                         (len(op_arguments), len(arguments)))

    if isinstance(arguments, dict):
        arg_names = [var.name for var in op_arguments]
        name_counter = collections.Counter(arg_names)

        var_name_map = dict((var.name, var) for var in op_arguments)
    else:
        if len(op_arguments) == 1:
            name_counter = collections.Counter([op_arguments[0].name])
            var_name_map = dict([(op_arguments[0].name, op_arguments[0])])
            arguments = dict([(op_arguments[0], arguments)])
        else:
            raise ValueError(
                'non-dict argument (%s) is not supported for nodes with more than one input'
                % type(arguments).__name__)

    if precision is not None:
        precision = sanitize_precision(precision)

    var_map = {}
    for var, batch in arguments.items():
        if isinstance(var, str):
            if name_counter[var] == 0:
                raise ValueError(
                    'variable with name "%s" does not exist in the network. Available variable names: %s'
                    % (var, ", ".join(var_name_map)))
            elif name_counter[var] > 1:
                raise ValueError('node name "%s" is not unique' % var)

            try:
                var = var_name_map[var]
            except KeyError:
                raise KeyError(
                    "no input with the name '%s' was found.  Available: %s" %
                    (var, ", ".join(var_name_map.keys())))

        if isinstance(batch, tuple):
            if seq_starts is not None:
                raise ValueError(
                    'you cannot provide sequence start '
                    'information globally and for individual batches '
                    'at the same time')

            batch, seq_starts = batch

            if seq_starts is not None:
                if not isinstance(seq_starts, (tuple, list)):
                    raise ValueError(
                        'if you specify sequence begin markers, it needs to be a list'
                    )

                sample_size = batch.shape[0] if hasattr(
                    batch, 'shape') else len(batch)

                if len(seq_starts) != sample_size:
                    raise ValueError('you have %i sequences, but only %i '
                                     'sequence begin markers' %
                                     (sample_size, len(seq_starts)))

        if isinstance(batch, MinibatchData):
            batch = batch.m_data
        elif not isinstance(batch, cntk_py.Value):
            batch = sanitize_batch(var, batch, seq_starts, device)

        var_map[var] = batch

    return var_map
Example #13
0
def mode(data):
    """Return the most common data item. If there are ties, return any one of them."""
    [(item, count)] = collections.Counter(data).most_common(1)
    return item
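# For example, mode(["a", "b", "b", "c"]) returns "b".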
Example #14
0
def reset_get_unique_name():
    """Reset the heaps that store previously-assigned names."""
    global name_heap, prefix_counts
    name_heap = set([None])
    prefix_counts = collections.Counter()
Example #15
0
import itertools
import collections

N = int(input())
A = list(map(int, input().split()))
acum = list(itertools.accumulate(A, initial=0))
c = collections.Counter(acum)
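# Equal prefix sums at indices i < j mean A[i:j] sums to 0, so a value that appears
# n times in `acum` contributes n * (n - 1) // 2 zero-sum subarrays.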
print(sum(n * (n - 1) // 2 for n in c.values()))  # nC2
Example #16
0
import sys
import json
import re
import collections
import os
import datetime

total_playlists = 0
total_tracks = 0
tracks = set()
artists = set()
albums = set()
titles = set()
total_descriptions = 0
ntitles = set()
title_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()
last_modified_histogram = collections.Counter()
num_edits_histogram = collections.Counter()
playlist_length_histogram = collections.Counter()
num_followers_histogram = collections.Counter()

quick = False
max_files_for_quick_processing = 5


def process_mpd(path):
    count = 0
    filenames = os.listdir(path)
    for filename in sorted(filenames):
Example #17
0
        occupationList.append(occupationList1[0])

#Output all fetched occupation in Output

dd = df.ix[0:, 0:3]
dd['Occupation'] = pd.Series(occupationList, index=df.index)
writer = pd.ExcelWriter('output.xlsx')
dd.to_excel(writer)
writer.save()

print("Final List")
print(occupationList)

#Print count of occupation
counts = collections.Counter(occupationList)
print(counts)

from pytagcloud import create_tag_image, make_tags
from pytagcloud.lang.counter import get_tag_counts
j = ""
for i in occupationList:
    if (i != 'no_occupation'):
        j = j + " " + i
print(j)

# Let's plot the result
import wordcloud
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
Example #18
0
#!/usr/bin/env python
import collections

with open('input/day_3') as file:
    data = file.read()

houses = collections.Counter()
position_x = [0, 0]
position_y = [0, 0]
houses[(0, 0)] = 2
santa = 0
for direction in data.strip():
    if direction == '^':
        position_y[santa] += 1
    elif direction == 'v':
        position_y[santa] -= 1
    elif direction == '>':
        position_x[santa] += 1
    elif direction == '<':
        position_x[santa] -= 1
    else:
        print('unknown character "{0}"'.format(direction))
    houses[(position_x[santa], position_y[santa])] += 1
    if santa == 1:
        santa = 0
    else:
        santa = 1

print(len(houses.keys()))
Example #19
0
import collections


def f(n, result, s, str):
    if len(str) == n:
        result.add(str)
        return
    for c in s:
        if s[c] == 0:
            continue
        str += c
        s[c] -= 1
        f(n, result, s, str)
        str = str[:-1]
        s[c] += 1
    return


s = input()
n = len(s)
s = collections.Counter(s)
result = set()
f(n, result, s, '')
result = sorted(result)
print(len(result))
for str in result:
    print(str)
Example #20
0
 def __init__(self):
     super().__init__()
     self.statistics = collections.Counter()
     self.hash_tags = collections.Counter()
     print("console initialized")
Example #21
0
 def __init__(self):
     self.counts = collections.Counter()
     self.means = collections.defaultdict(float)
     self.vars = collections.defaultdict(float)
Example #22
0
 def create_accumulator(self):
   return {'total_count': 0, 'lang_count': collections.Counter()}
Example #23
0
# You're using Python Standard Library?
import collections
# But that's cheating!!
import re

# Little classes are like snowflakes, there are no two alike.
Phrase = type(
    "", (str, ), {
        "word_count":
        lambda self: collections.Counter(re.findall("\w+", self.lower()))
    })
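# For example (with the Phrase class above):
#   Phrase("Ha ha ha!").word_count()  ->  Counter({'ha': 3})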
Example #24
0
    with open(os.path.join(outputFile5, mot[0] + ".json"), "w") as file :
        json.dump(index, file, indent=4)
total = 0 ;
for folder, subs, files in os.walk(inputFile):
        for mailFile in files:
            total += 1 ;
            print("\rPas de crash ! mail : " + str(total), end="") 
            try :
                cibleMail = os.path.join(folder, mailFile)
                mail = open(cibleMail).read()
                b = email.message_from_string(mail).get_payload()
                msg = email.message_from_file(open(cibleMail))
                if msg['Subject']:
                    Subject = msg['Subject']
                    b = Subject + " " + b
                arr = re.split("\W+", b)
                arr = [mot.lower() for mot in arr]
                counter = collections.Counter(arr)
                for mot,nb in counter.items():
                    if 0 < len(mot) < 4 :
                        index_less_than_four_letters_insert(mot, nb, cibleMail)
                    elif len(mot) > 5 :
                        index_more_than_five_letters_insert(mot, nb, cibleMail)
                    elif 3 < len(mot) < 6 :
                        index_four_or_five_letters_insert(mot,nb, cibleMail)
            except IOError:
                print(IOError)
        print("\n")


Example #25
0
def has_anagrams(phrase):
    counters = [collections.Counter(word) for word in phrase]
    for i, word_count in enumerate(counters):
        if word_count in counters[i+1:]:
            return(True)
    return(False)
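# For example, has_anagrams(["listen", "silent", "abc"]) returns True,
# while has_anagrams(["cat", "dog"]) returns False.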
Example #26
0
	def most_common_tags(self):
		print(f"{gr}[+]{nu} user uploads tags : \n")
		for key, value in collections.Counter(self.tags).most_common():
			print(f"{gr}{key} : {nu}{value}")
Example #27
0
def main(params):
    """
        :param params:
        :return:
        coco_pred_sg:
        {'rela_matrix': array([[0., 2., 425.]),
           'obj_attr': array([[0., 36., 313.])}

        coco_spice_sg:
        {'rela_info': array([[5.0000e+00, 0.0000e+00, 1.0414e+04]),
          'obj_info': [[179], [833], [1018], [1092], [3788], [5989, 6623, 4081], [7128], [7372, 5989]]}
    """
    imgs = json.load(open(params['input_json'], 'r'))['images']
    # category to supercategory
    cat2supercat = {}
    text = open(params['category_txt'], 'r').readlines()
    for line in text:
        line = line.strip().split(',')
        assert len(line) >= 1
        for i in range(len(line)):
            cat2supercat[line[i]] = line[0]

    # detection categories
    det_dict = np.load(params['img_sg_dict'], allow_pickle=True)[()]['i2w']
    sent_dict = np.load(params['sent_sg_dict'], allow_pickle=True)['spice_dict'][()]['ix_to_word']

    # sentence scene graph and pre-trained image scene graph
    sg_img_dir = 'data/coco_img_sg'
    sg_snt_dir = 'data/coco_spice_sg2'

    # load predicates vocab if not None
    if os.path.isfile(params['pred_category']):
        print ('--loading predicates--')
        predicates = {key: i for i, (key, _) in enumerate(collections.Counter(json.load(
                                    open(params['pred_category'], 'r'))).most_common(params['top_predicates']))}
    else:
        predicates = None
    aligned_triplets = {}
    pred_candidate = []

    if predicates is None:
        print('----------------collecting predicates---------------------')
    else:
        print('----------------collecting alignments---------------------')

    for img in tqdm(imgs):
        split = img['split']
        if split not in ['train', 'restval'] and predicates is None:
            continue
        name = str(img['id']) + '.npy'

        sg_img_file = os.path.join(sg_img_dir, name)
        sg_snt_file = os.path.join(sg_snt_dir, name)
        sg_img_use = np.load(sg_img_file, encoding='latin1', allow_pickle=True)[()]
        sg_snt_use = np.load(sg_snt_file, encoding='latin1', allow_pickle=True)[()]
        sg_snt_rela = sg_snt_use['rela_info'].astype(int)
        sg_snt_obj = sg_snt_use['obj_info']

        sg_img_obj_set = set(
            [det_dict[ele] for ele in set(sg_img_use['obj_attr'].astype(int)[:, 1].reshape(-1).tolist())])
        sg_img_obj = [det_dict[ele] for ele in sg_img_use['obj_attr'].astype(int)[:, 1].reshape(-1).tolist()]

        tmp = dict()
        for snt_rela in sg_snt_rela:
            sub_ix, obj_ix, pred_ix = snt_rela
            sub, obj = sg_snt_obj[sub_ix], sg_snt_obj[obj_ix]
            sub, obj, pred = decode(sub, obj, pred_ix, sent_dict)

            if predicates is not None and pred in predicates:
                all_pairs = get_index(sub, obj, sg_img_obj, cat2supercat)
                if len(all_pairs) != 0:
                    tmp[predicates[pred]] = all_pairs

            if predicates is None and check(sub, obj, sg_img_obj_set, cat2supercat):
                pred_candidate.append(pred)

        if predicates is not None:
            aligned_triplets[name.split('.')[0]] = tmp

    if predicates is None:
        cnt_pred = collections.Counter(pred_candidate)
        json.dump(dict(cnt_pred), open(params['pred_category'], 'w'))
    else:
        json.dump(aligned_triplets, open(params['aligned_triplets'], 'w'), indent=4, sort_keys=True, default=str)
        get_dict_file(params)

    return
Example #28
0
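# A hedged note (assumption, not from the original snippet): `layers` is expected to be
# a collection of digit strings; the layer with the fewest '0' digits is chosen and the
# product of its '1' and '2' counts is returned.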
def part_one():
    counted_layers = [collections.Counter(layer) for layer in layers]
    sorted_layers = sorted(counted_layers, key=lambda x: x['0'])
    return sorted_layers[0]['1'] * sorted_layers[0]['2']
Example #29
0
    for function_name in func_names:
        verbs.append(get_verbs_from_function_name(function_name))
    return make_flat_list(verbs)


def get_verbs_from_function_name(function_name):
    return [word for word in function_name.split('_') if is_verb(word)]


if __name__ == "__main__":
    wds = []
    projects = [
        'projects/django',
    ]
    ''' 'flask',
      'pyramid',
      'reddit',
      'requests',
      'sqlalchemy',
  
  ]
  '''
    for project in projects:
        path = os.path.join('.', project)
        get_top_verbs_in_path(path)
        wds += get_top_verbs_in_path(path)
    top_size = 200
    print('total {} words, {} unique'.format(len(wds), len(set(wds))))
    for word, occurence in collections.Counter(wds).most_common(top_size):
        print(word, occurence)
Example #30
0

def insert_card(card_number, card_list):
    i = 0
    while i < card_number:
        card_list.append(input())
        i += 1


blue_card_number = int(input())
blue_card_list = []
insert_card(blue_card_number, blue_card_list)

red_card_number = int(input())
red_card_list = []
insert_card(red_card_number, red_card_list)

blue_card_collection = collections.Counter(blue_card_list)
red_card_collection = collections.Counter(red_card_list)

result_card_collection = blue_card_collection - red_card_collection

answer_collection = sorted(result_card_collection.items(),
                           key=lambda x: x[1],
                           reverse=True)

if answer_collection == []:
    print(0)
else:
    print(answer_collection[0][1])