Example no. 1
# data_path and window_size are assumed to be defined by the surrounding
# script; DataProcessor is imported from the local `process` module, as in
# the later examples.
from process import DataProcessor

cdr_path = data_path + 'cdr/'
bc_path = data_path + 'bc/'

embeddings_file = data_path + 'embeddings/glove.6B/glove.6B.100d.txt'

set_labels = {
    'cdr': ['Chemical', 'Disease'],
    'bc': ['Chemical', 'Gene'],
    'weak': [
        'Disease', 'Chemical', 'Species', 'Gene', 'ProteinMutation',
        'DNAMutation', 'SNP'
    ]
}

dp = DataProcessor(set_labels=set_labels,
                   vocab=embeddings_file,
                   window_size=window_size)

dp.read_file(cdr_path + 'ner_CID_Training_mine_PubTator.txt',
             'cdr_train_weak',
             'weak',
             update=True)

dp.read_file(cdr_path + 'ner_CDR_TrainingSet.PubTator.txt',
             'cdr_train',
             'cdr',
             update=True)
dp.read_file(cdr_path + 'ner_CDR_DevelopmentSet.PubTator.txt', 'cdr_dev',
             'cdr')
dp.read_file(cdr_path + 'ner_CDR_TestSet.PubTator.txt', 'cdr_test', 'cdr')
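After the reads complete, the processor's token, label, and shape maps are
populated (the same attributes that Example no. 9 reads); a minimal
inspection sketch:

print('tokens:', len(dp.token_map))
print('labels:', len(dp.label_map))
print('shapes:', len(dp.shape_map))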
Example no. 2
# -*- coding: utf-8 -*-

import redis
from flask import Flask, jsonify

import config
from process import DataProcessor

app = Flask(__name__)
dp = DataProcessor()

client = redis.StrictRedis(host=config.redis_host,
                           port=config.redis_port,
                           decode_responses=True,
                           charset='utf-8')


@app.route('/')
def index():
    return 'OK'


@app.route('/<protocol>/random')
def random(protocol):
    """
    随机返回一个优质代理IP
    :param protocol:
    :return:
    """
    return client.srandmember('{}:proxies:{}'.format(protocol, 1))
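A hedged client-side sketch: assuming the app is being served locally on port
5000 (e.g. via flask run) and proxies have been stored under keys like
'http:proxies:1', a random proxy can be fetched over HTTP:

import requests

# Ask the service above for one random high-quality HTTP proxy.
proxy = requests.get('http://127.0.0.1:5000/http/random').text
print(proxy)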
Example no. 3
from utils import get_sidereal_time
from process import open_fits, flatten_max, DataProcessor
import dateutil.parser
import os, shutil

dp = DataProcessor()
dp.outdir = 'test/out'
dp.verbose = 1

st = '0429'
path_end = os.path.join(st[0:2], st[2:4])
path = os.path.join('sid', path_end)
night = os.listdir(path)

dp.do_total = True
dp.indir = 'sid'
dp.do_filter = False
dp.do_output = False
dp.process_night(path, night)

# Standalone use of Django's template engine; outside a configured Django
# project, django.conf.settings.configure() must be called before this works.
from django.template import Context, Template
t = Template(open(os.path.join('clouds', 'templates', 'clouds', 'image.html')).read())

from catlib import parse_cat

# iterrows() yields (index, row) pairs; keep only the rows (Python 3 form of
# the original Python 2 tuple-argument lambda).
point_list = [row for _, row in
              parse_cat(os.path.join('test', 'out', 'cat', path, 'total.cat')).iterrows()]
print(len(point_list))
with open(os.path.join('test', st + '.html'), 'w') as out:
    out.write(t.render(Context({'point_list': point_list,
                                'point_pk': -1,
                                'object': {'get_url': path + '/total'}})))
Example no. 4
"""
Usage:
    parser_cli.py [options] INPUT_FILEPATH

Options:
    -h --help
    --language LANGUAGE             Language
"""
import json

from docopt import docopt
from tree_sitter import Language

from language_data import LANGUAGE_METADATA
from process import DataProcessor

if __name__ == '__main__':
    args = docopt(__doc__)

    DataProcessor.PARSER.set_language(
        Language('/src/build/py-tree-sitter-languages.so', args['--language']))
    processor = DataProcessor(language=args['--language'],
                              language_parser=LANGUAGE_METADATA[
                                  args['--language']]['language_parser'])

    functions = processor.process_single_file(args['INPUT_FILEPATH'])
    print(json.dumps(functions, indent=2))
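Given the docopt usage string above, an invocation sketch (the input path is
hypothetical; the language must be a key of LANGUAGE_METADATA, and the
tree-sitter .so path is hard-coded in the script):

python parser_cli.py --language python path/to/input.py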
Example no. 5
# Context: this function comes from a larger script; it relies on module-level
# imports (sys, hashlib, ast, javalang, lib2to3's driver/pygram/pytree,
# tree_sitter.Language) and on helpers (remove_func_name, subtokenize, the
# *_REJECT_REGEX patterns and BANNED_*_SHAS sets) defined elsewhere.
def process(target):

    DataProcessor.PARSER.set_language(
        Language('/src/build/py-tree-sitter-languages.so', sys.argv[1]))
    processor = DataProcessor(
        language=sys.argv[1],
        language_parser=LANGUAGE_METADATA[sys.argv[1]]['language_parser']
    )
    
    results = []

    if target['language'] == 'java':
        try:
            javalang.parse.parse(target['the_code'])
        except Exception as ex:
            if sys.argv[2] != 'gz':
                print('Failed to validate: ' + target['from_file'])
                print(target['the_code'])
                print(ex)
            return False, []
    elif target['language'] == 'python':
        try:
            parser = driver.Driver(pygram.python_grammar, convert=pytree.convert)
            parser.parse_string(target['the_code'].strip() + '\n')
            ast.parse(target['the_code'])
        except Exception:
            if sys.argv[2] != 'gz':
                print('Failed to validate: ' + target['from_file'])
            return False, []

    functions = processor.process_blob(target['the_code'])
        
    for function in functions:
        sha256 = hashlib.sha256(
            function["function"].strip().encode('utf-8')
        ).hexdigest()

        if target['language'] == 'java':
            if JAVA_REJECT_REGEX.search(function["function"]):
                continue
            if sha256 in BANNED_JAVA_SHAS:
                # print("  - Skipped '{}'".format(sha256))
                continue # Spoon transformer chokes on these, so exclude
        elif target['language'] == 'python':
            if PY_REJECT_REGEX.search(function["function"]):
                continue
            if sha256 in BANNED_PY_SHAS:
                # print("  - Skipped '{}'".format(sha256))
                continue # Spoon transformer chokes on these, so exclude

        tokens_pre, tokens_post = ([], [])

        try:
            tokens_pre, tokens_post = remove_func_name(
                function["identifier"].split('.')[-1],
                function["function_tokens"]
            )
        except Exception:
            continue
    
        results.append({
            "language": function["language"],
            "identifier": function["identifier"].split('.')[-1],
            "target_tokens": subtokenize(function["identifier"].split('.')[-1]),
            "source_tokens": tokens_post,
            "elided_tokens": tokens_pre,
            "source_code": function["function"] if function["language"] != "java" else (
                'class WRAPPER {\n' + function["function"] + '\n}\n'
            ),
            "sha256_hash": sha256,
            "split": target['split'],
            "from_file": target['from_file']
        })
    
    return True, results
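A hedged sketch of calling process(): the dict keys below are exactly the
ones the function reads, while the code string and file name are
hypothetical; sys.argv[1] and sys.argv[2] must already hold the language and
the output mode:

# Hypothetical input record for process().
target = {
    'language': 'python',
    'the_code': 'def add(a, b):\n    return a + b\n',
    'from_file': 'example.py',
    'split': 'train',
}
ok, rows = process(target)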
Example no. 6
    # Context: args, dees, and dents are provided by the surrounding script.
    # Group the pickled definitions by repository name-with-owner (nwo).
    definitions = defaultdict(list)
    with open(args['DEFINITION_FILE'], 'rb') as f:
        for d in pickle.load(f):
            definitions[d['nwo']].append(d)
    definitions = dict(definitions)

    # Fill candidates from the most depended-upon libraries
    c = Counter(dees)
    library_candidates = {}
    for nwo, _ in c.most_common(len(c)):
        if nwo.split('/')[-1] not in library_candidates and nwo in definitions:
            # Approximate library name with the repository name from nwo
            library_candidates[nwo.split('/')[-1]] = definitions[nwo]

    DataProcessor.PARSER.set_language(Language(args['--tree-sitter-build'], args['--language']))
    processor = DataProcessor(language=args['--language'],
                              language_parser=LANGUAGE_METADATA[args['--language']]['language_parser'])

    with Pool(processes=int(args['--processes'])) as pool:
        output = pool.imap_unordered(functools.partial(processor.process_dent,
                                                       ext=LANGUAGE_METADATA[args['--language']]['ext']),
                                     dents)
        # Consume the lazy imap_unordered iterator while the pool is still
        # alive; the original consumed it after the pool had been terminated.
        dent_definitions, edges = map(list, map(flatten, zip(*output)))

    with gzip.GzipFile(args['OUTPUT_DIR'] + '{}_dent_definitions.pkl.gz'.format(args['--language']), 'wb') as outfile:
        pickle.dump(dent_definitions, outfile)
    with gzip.GzipFile(args['OUTPUT_DIR'] + '{}_edges.pkl.gz'.format(args['--language']), 'wb') as outfile:
        pickle.dump(edges, outfile)
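The two pickles can be read back the same way they were written; a minimal
sketch under the same args:

import gzip
import pickle

with gzip.open(args['OUTPUT_DIR'] + '{}_dent_definitions.pkl.gz'.format(args['--language']), 'rb') as f:
    dent_definitions = pickle.load(f)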
Example no. 7
# Assumes DataProcessor is imported from the local `process` module, as in
# the other examples.
hidden_dropout = 0.75
input_dropout = 0.5
middle_dropout = 1.0
word_dropout = 0.75

clip_norm = 5

batch_size = 32

################
# process data #
################

embeddings_file = '/home/nathan/Programming/research/data/embeddings/glove.6B/glove.6B.100d.txt'

dp = DataProcessor(vocab=embeddings_file)

dp.read_file(
    '/home/nathan/Programming/research/data/cdr/ner_CDR_train.txt',
    '/home/nathan/Programming/research/sandbox/protos/cdr_train.proto',
    'cdr',
    update=True)

# dp.read_file('/home/nathan/Programming/research/data/cdr/ner_CDR_test.txt',
#              '/home/nathan/Programming/research/sandbox/protos/cdr_test.proto',
#              'cdr')

# dp.read_file('/home/nathan/Programming/research/data/cdr/ner_CDR_dev.txt',
#              '/home/nathan/Programming/research/sandbox/protos/cdr_dev.proto',
#              'cdr')
Example no. 8
from utils import get_sidereal_time
from process import open_fits, flatten_max, DataProcessor
import dateutil.parser
import os, shutil

dp = DataProcessor()
dp.outdir = 'test/out'
dp.verbose = 1

#date_obs = '2011-05-25T06:00:10'
date_obs = '2012-02-29T10:37:12'

name = date_obs + '.fits'
path = os.path.join('sym', name[0:4], name[5:7], name[8:10])

dp.process_file(os.path.join(path, name))

"""
dt = dateutil.parser.parse(name.split('.')[0])
s = get_sidereal_time(dt).seconds
path_end = os.path.join(*[str(x).zfill(2) for x in [s // 3600, (s // 60) % 60]])
fname = os.path.join('out', 'fits', 'sid', path_end, 'total.fits')
tdata = open_fits(fname)
night = os.listdir(os.path.join('sid', path_end))

for i in [100, 250, 500, 1000, 3000, 4000, 5000, 2000]:
    dp.output('total', tdata, image_filter=flatten_max(i*len(night)))
    shutil.copyfile(os.path.join('test','out','png','total.png'),
            os.path.join('test', 'total{0}.png').format(i))
"""
Example no. 9
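# Assumes the same setup as Example no. 1: DataProcessor imported from the
# local `process` module, with embeddings_file, window_size, and path
# defined upstream.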
set_labels = {
    'A': [
        'T005', 'T007', 'T037', 'T038', 'T058', 'T074', 'T092', 'T098', 'T168',
        'T170'
    ],
    'B': [
        'T017', 'T031', 'T062', 'T082', 'T091', 'T097', 'T103', 'T201', 'T204'
    ],
    'full': [
        'T005', 'T007', 'T037', 'T038', 'T058', 'T074', 'T092', 'T098', 'T168',
        'T170', 'T017', 'T031', 'T062', 'T082', 'T091', 'T097', 'T103', 'T201',
        'T204'
    ]
}

dp = DataProcessor(set_labels=set_labels,
                   vocab=embeddings_file,
                   window_size=window_size)

dp.read_file(path + 'train_split_A_modified', 'A_train', 'A', update=True)
dp.read_file(path + 'train_split_B_modified', 'B_train', 'B', update=True)

dp.read_file(path + 'ner_dev', 'dev', 'full')
dp.read_file(path + 'ner_test', 'test', 'full')

###############
# build model #
###############

vocab_size = len(dp.token_map)
labels_size = len(dp.label_map)
shape_domain_size = len(dp.shape_map)