コード例 #1
0
>>> 160 <= dataset_counts['Journal Archives'] <= 180
True
>>> 100 <= dataset_counts['BMRB'] <= 120
True
>>> dataset_counts['treebase']
0
'''
from collections import defaultdict
from process_dataset_list import clean_repo_name

dataset_counts = defaultdict(lambda: 0)

with open('data/all_datasets.tsv', 'r') as input_file:
    input_file.readline()

    for line in input_file:
        line = line.strip()
        if not line or line.startswith('#'): continue

        vals = line.split('\t')
        repo, n = vals[0], int(vals[2]) + int(vals[3])

        repo = clean_repo_name(repo)
        if repo is None: continue

        if n: dataset_counts[repo] += 1

if __name__ == '__main__':
    for key in sorted(dataset_counts):
        print key + '\t' + str(dataset_counts[key])
コード例 #2
0
ファイル: all_datasets.py プロジェクト: lsheble/1000-datasets
# Matches a four-digit year (1000-2999) delimited by '(', '[' or a space
# on the left and ')', ']' or '.' on the right, capturing the year itself.
year_regex = re.compile('[(\[ ]([1-2][0-9]{3})[)\].]')

# ls data_files = fnmatch.filter(os.listdir('data/repo_datasets/'), 
# ls                          '*_datasets.csv')

# One dataset-listing file per repository.
data_files = fnmatch.filter(os.listdir('data/cleaner_old_all_datasets/'), 
                            '*.tsv')

if __name__ == '__main__':
    # Header for the tab-separated output table (Python 2 print statement).
    print 'repo\tid\twos\tgs\tyear'
    
    for data_file in data_files:
# ls         path = os.path.join('data/repo_datasets', data_file)
        path = os.path.join('data/cleaner_old_all_datasets', data_file)
        # Derive the canonical repository name from the file name.
        # NOTE(review): the glob above matches '*.tsv' but this strips a
        # '_datasets.csv' suffix -- verify the actual file naming.
        repo = clean_repo_name(data_file[:-len('_datasets.csv')])
        with open(path) as input_file:
            r = csv.reader(input_file)
            # First row holds the column titles; the layout varies between
            # files, so columns are located by title below.
            header = next(r)
            
            # find the WoS citation column by looking at the column titles
            wos_cols = [n for n, x in enumerate(header) 
                        if 'wos cited by how many' in x.lower()]
            assert len(wos_cols) == 1
            wos_col = wos_cols[0]
            
            # find the GS search results column by looking at the column titles
            gs_cols = [n for n, x in enumerate(header)
                       if 'results' in x.lower()
                       and ('gs' in x.lower() or 'google' in x.lower()
                            or 'search' in x.lower())]
コード例 #3
0
for key in keys:
    with open('data/reuse_subsample_%s' % key) as input_file:
        for line in input_file:
            line = line.rstrip()
            if not line or line.startswith('#'): continue

            try:            
                count = count_re.findall(line)[0]
            except IndexError:
                raise Exception("Invalid input. reuse_subsample_%s should contain output from uniq -c." % key)
            n = int(count)
            line = line[len(count):].rstrip('\n')
            confidence, reuse_status, repo = line.split('\t')
            if not reuse_status: continue
            repo = clean_repo_name(repo)
            if repo is None: continue
            
            if 'low' in confidence:
                pass
            elif 'not reused' in reuse_status:
                pass
            elif 'reused' in reuse_status:
                reuse_counts[key][repo] += n
            
            candidate_counts[key][repo] += n
        
if __name__ == '__main__':
    for key in keys:
        for repo in sorted(reuse_counts):
            print key + '\t' + repo + '\t' + str(reuse_counts[key][repo]) + '\t' + str(candidate_counts[key][repo])
コード例 #4
0
import os
import fnmatch
import re
import csv  # BUG FIX: csv.reader is used in the main loop below, but csv was never imported

from process_dataset_list import clean_repo_name

# Matches a four-digit year (1000-2999) delimited by '(', '[' or a space
# on the left and ')', ']' or '.' on the right, capturing the year itself.
year_regex = re.compile('[(\[ ]([1-2][0-9]{3})[)\].]')

# One "<repo>_datasets.csv" listing file per repository.
data_files = fnmatch.filter(os.listdir('data/repo_datasets/'), 
                            '*_datasets.csv')

if __name__ == '__main__':
    # Header for the tab-separated output table: repo name, dataset id,
    # WoS citations, GS results, year (Python 2 print statement).
    print 'repo\tid\twos\tgs\tyear'
    
    for data_file in data_files:
        path = os.path.join('data/repo_datasets', data_file)
        # "<repo>_datasets.csv" -> canonical repository name.
        repo = clean_repo_name(data_file[:-len('_datasets.csv')])
        with open(path) as input_file:
            r = csv.reader(input_file)
            # First row holds the column titles; the layout varies between
            # repo files, so columns are located by title below.
            header = next(r)
            
            # find the WoS citation column by looking at the column titles
            wos_cols = [n for n, x in enumerate(header) 
                        if 'wos cited by how many' in x.lower()]
            assert len(wos_cols) == 1
            wos_col = wos_cols[0]
            
            # find the GS search results column by looking at the column titles
            gs_cols = [n for n, x in enumerate(header)
                       if 'results' in x.lower()
                       and ('gs' in x.lower() or 'google' in x.lower()
                            or 'search' in x.lower())]