>>> 160 <= dataset_counts['Journal Archives'] <= 180
True
>>> 100 <= dataset_counts['BMRB'] <= 120
True
>>> dataset_counts['treebase']
0
'''
from collections import defaultdict

from process_dataset_list import clean_repo_name

dataset_counts = defaultdict(int)

with open('data/all_datasets.tsv', 'r') as input_file:
    input_file.readline()  # skip the header row
    for line in input_file:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        vals = line.split('\t')
        # column 0 is the repository; columns 2 and 3 are the WoS and GS counts
        repo, n = vals[0], int(vals[2]) + int(vals[3])
        repo = clean_repo_name(repo)
        if repo is None:
            continue
        if n:
            # count datasets with at least one WoS citation or GS hit
            dataset_counts[repo] += 1

if __name__ == '__main__':
    for key in sorted(dataset_counts):
        print(key + '\t' + str(dataset_counts[key]))
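# All of these scripts normalize repository names through clean_repo_name
# from process_dataset_list, which is not shown here. The stand-in below is
# a hypothetical sketch of the contract the callers rely on (a canonical
# name for known repositories, None for rows that should be skipped); the
# alias table is invented for illustration, not taken from the real module.

CANONICAL_NAMES = {
    'bmrb': 'BMRB',          # invented entry
    'treebase': 'TreeBASE',  # would explain why dataset_counts['treebase'] is 0
}

def clean_repo_name(raw_name):
    """Return a canonical repository name, or None to skip the row."""
    name = raw_name.strip()
    if not name:
        return None
    return CANONICAL_NAMES.get(name.lower(), name)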
import csv
import fnmatch
import os
import re

from process_dataset_list import clean_repo_name

year_regex = re.compile(r'[(\[ ]([1-2][0-9]{3})[)\].]')

# earlier version read the per-repository CSV exports:
# data_files = fnmatch.filter(os.listdir('data/repo_datasets/'),
#                             '*_datasets.csv')
data_files = fnmatch.filter(os.listdir('data/cleaner_old_all_datasets/'),
                            '*.tsv')

if __name__ == '__main__':
    print('repo\tid\twos\tgs\tyear')
    for data_file in data_files:
        # path = os.path.join('data/repo_datasets', data_file)
        path = os.path.join('data/cleaner_old_all_datasets', data_file)
        # len('_datasets.csv') == len('_datasets.tsv'), so this slice also
        # strips a '_datasets.tsv' suffix from the new file names
        repo = clean_repo_name(data_file[:-len('_datasets.csv')])
        with open(path) as input_file:
            r = csv.reader(input_file, delimiter='\t')  # the new files are TSV
            header = next(r)
            # find the WoS citation column by looking at the column titles
            wos_cols = [n for n, x in enumerate(header)
                        if 'wos cited by how many' in x.lower()]
            assert len(wos_cols) == 1
            wos_col = wos_cols[0]
            # find the GS search results column by looking at the column titles
            gs_cols = [n for n, x in enumerate(header)
                       if 'results' in x.lower() and
                       ('gs' in x.lower() or 'google' in x.lower() or
                        'search' in x.lower())]
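# year_regex only accepts a four-digit year (1000-2999) when it is preceded
# by '(', '[', or a space and followed by ')', ']', or '.'. A quick sanity
# check; the sample strings below are made up:

import re

year_regex = re.compile(r'[(\[ ]([1-2][0-9]{3})[)\].]')

print(year_regex.findall('Smith et al. (2007) deposited ...'))  # ['2007']
print(year_regex.findall('released [1998], revised in 2001.'))  # ['1998', '2001']
print(year_regex.findall('accession 20071234'))                 # [] - no delimiters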
for key in keys:
    with open('data/reuse_subsample_%s' % key) as input_file:
        for line in input_file:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue
            try:
                # uniq -c prefixes each line with a right-aligned count
                count = count_re.findall(line)[0]
            except IndexError:
                raise Exception('Invalid input. reuse_subsample_%s should '
                                'contain output from uniq -c.' % key)
            n = int(count)  # int() tolerates the leading spaces in count
            line = line[len(count):].rstrip('\n')
            confidence, reuse_status, repo = line.split('\t')
            if not reuse_status:
                continue
            repo = clean_repo_name(repo)
            if repo is None:
                continue
            # every surviving row is a reuse candidate; only rows judged
            # 'reused' at better-than-low confidence count as actual reuse
            if 'low' in confidence:
                pass
            elif 'not reused' in reuse_status:
                pass
            elif 'reused' in reuse_status:
                reuse_counts[key][repo] += n
            candidate_counts[key][repo] += n

if __name__ == '__main__':
    for key in keys:
        # iterate over candidate_counts[key] so repos with zero confirmed
        # reuses are still reported
        for repo in sorted(candidate_counts[key]):
            print(key + '\t' + repo + '\t' + str(reuse_counts[key][repo]) +
                  '\t' + str(candidate_counts[key][repo]))
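# The reuse_subsample_* files are expected to hold `sort | uniq -c` output:
# a right-aligned count, a space, then tab-separated confidence, reuse
# status, and repository fields. count_re is defined earlier in the script;
# the pattern and sample line below are assumptions for illustration.

import re

count_re = re.compile(r'^ *[0-9]+')          # assumed shape of count_re

sample = '     12 high\treused\tDryad'       # made-up uniq -c output line
count = count_re.findall(sample)[0]          # '     12', leading spaces kept
n = int(count)                               # int() tolerates leading spaces
confidence, reuse_status, repo = sample[len(count):].split('\t')
print('%d|%s|%s|%s' % (n, confidence, reuse_status, repo))
# 12| high|reused|Dryad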
import csv
import fnmatch
import os
import re

from process_dataset_list import clean_repo_name

year_regex = re.compile(r'[(\[ ]([1-2][0-9]{3})[)\].]')

data_files = fnmatch.filter(os.listdir('data/repo_datasets/'),
                            '*_datasets.csv')

if __name__ == '__main__':
    print('repo\tid\twos\tgs\tyear')
    for data_file in data_files:
        path = os.path.join('data/repo_datasets', data_file)
        repo = clean_repo_name(data_file[:-len('_datasets.csv')])
        with open(path) as input_file:
            r = csv.reader(input_file)
            header = next(r)
            # find the WoS citation column by looking at the column titles
            wos_cols = [n for n, x in enumerate(header)
                        if 'wos cited by how many' in x.lower()]
            assert len(wos_cols) == 1
            wos_col = wos_cols[0]
            # find the GS search results column by looking at the column titles
            gs_cols = [n for n, x in enumerate(header)
                       if 'results' in x.lower() and
                       ('gs' in x.lower() or 'google' in x.lower() or
                        'search' in x.lower())]
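# The header matching above is deliberately fuzzy so that small differences
# in how each repository's spreadsheet names its columns do not matter. A
# quick check against a made-up header row:

header = ['ID', 'Title', 'WoS cited by how many?', 'GS search results', 'Year']

wos_cols = [n for n, x in enumerate(header)
            if 'wos cited by how many' in x.lower()]
gs_cols = [n for n, x in enumerate(header)
           if 'results' in x.lower() and
           ('gs' in x.lower() or 'google' in x.lower() or
            'search' in x.lower())]
print(wos_cols)  # [2]
print(gs_cols)   # [3]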