Example #1
def generate_scan_image(subset):
  list_dirs = os.walk(TRUNK_DIR + subset)
  jsobjs = []
  output_dir = SAMPLE_DIR + subset
  mkdir(output_dir)
  for root, dirs, files in list_dirs:
    for f in files:
      if f.lower().endswith('mhd'):
        key = os.path.splitext(f)[0]
        numpyImage, numpyOrigin, numpySpacing = (
          util.load_itk_image(
            os.path.join(root, f)))
        for z in range(numpyImage.shape[0]):
          patch = numpyImage[z, 0:512, 0:512]
          patch = util.normalizePlanes(patch)
          im = Image.fromarray(patch * 255).convert('L')
          output_filename = (
            subset + "-" + key + "-" + str(z) + "-scan.bmp")
          print(subset + '/' + output_filename)
          im.save(os.path.join(
            output_dir, output_filename))
          jsobjs.append({
            "image_path": subset + '/' + output_filename,
            "rects": []
          })
  with open(META_DIR + subset + '-scan.json', 'w') as f:
    json.dump(jsobjs, f)
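
A hedged usage sketch for generate_scan_image, assuming TRUNK_DIR contains the ten LUNA16 subset folders (subset0 through subset9), as the other examples on this page suggest:

if __name__ == '__main__':
  # Hypothetical driver: render scan slices for every subset folder.
  for i in range(10):
    generate_scan_image('subset' + str(i))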
Example #2
def make_log_file(self, base_directory, filename):
    util.mkdir(base_directory)
    # items() replaces the Python 2-only iteritems(); the with-block closes the file.
    with open(base_directory + filename, 'w') as f:
        for k, v in self.__dict__.items():
            f.write(str(k) + " : " + str(v) + "\n")
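
Every example on this page calls a small mkdir helper (util.mkdir, utilities.mkdir, or a bare mkdir). A minimal sketch, assuming the helper should silently skip directories that already exist; the projects' own versions may differ:

import os

def mkdir(path):
    # Create the directory (and any missing parents) only if it does not exist yet.
    if not os.path.isdir(path):
        os.makedirs(path)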
Example #3
def prepare():
    args = read_params(sys.argv)
    conf = ConfigParser()
    conf.read(args['config'])
    args['wdir'] = conf.get('param', 'work_dir')
    args['rdir'] = conf.get('param', 'raw_dir_name')
    args['rdir'] = '%s/%s' % (args['wdir'], args['rdir'])
    mkdir('%s/temp' % args['rdir'])
    return args
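
prepare() reads just two options from a [param] section. A hedged sketch of a matching config file, written with ConfigParser; the paths are placeholders, not the project's real values:

from configparser import ConfigParser

conf = ConfigParser()
conf['param'] = {
    'work_dir': '/path/to/work_dir',   # placeholder
    'raw_dir_name': 'raw_data',        # placeholder
}
with open('example.config', 'w') as cfg:
    conf.write(cfg)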
Example #4
def generate_data(data_root, data_map, fp_dir):
  list_dirs = os.walk(data_root)
  index = 0
  for i in range(10):
    # create the per-subset output directories under fp_dir, where the patches are saved below
    util.mkdir(os.path.join(fp_dir, 'subset' + str(i)))
  meta = dict([('subset' + str(i), []) for i in range(10)])
  for root, dirs, files in list_dirs:
    for f in files:
      if f.lower().endswith("mhd"):
        print(f)
        key = os.path.splitext(f)[0]
        subset = root.split("/")[-1]
        if key in data_map:
          numpyImage, numpyOrigin, numpySpacing = util.load_itk_image(os.path.join(root, f))
          for it in data_map[key]:
            worldCoord, label = it
            voxelCoord = util.worldToVoxelCoord(worldCoord, numpyOrigin, numpySpacing)
            voxelWidth = 65
            x = int(voxelCoord[1])
            y = int(voxelCoord[2])
            z = int(voxelCoord[0])
            patch = numpyImage[z, x - voxelWidth // 2:x + voxelWidth // 2,
                                  y - voxelWidth // 2:y + voxelWidth // 2]
            patch = util.normalizePlanes(patch)
            if patch.size == 0:
              continue

            fpath = os.path.join(fp_dir, subset, 'patch_' + str(index) + '.bmp')
            Image.fromarray(patch * 255).convert('L').save(fpath)
            meta[subset].append((fpath, label))
            index += 1

            if label == 1:
              for i in range(50):
                dx, dy = MOV_LIST[i % 8]
                xx = x + int(dx * np.random.rand())
                yy = y + int(dy * np.random.rand())
                aug_patch = numpyImage[z, xx - voxelWidth // 2:xx + voxelWidth // 2,
                                       yy - voxelWidth // 2:yy + voxelWidth // 2]
                aug_patch = util.normalizePlanes(aug_patch)
                if aug_patch.size == 0:
                  continue
                fpath = os.path.join(fp_dir, subset, 'patch_' + str(index) + '.bmp')
                Image.fromarray(aug_patch * 255).convert('L').save(fpath)
                meta[subset].append((fpath, label))
                index += 1
  with open(META_DIR + 'fp.json', 'w') as f:
    json.dump(meta, f)
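
generate_data relies on util.worldToVoxelCoord and util.normalizePlanes. The sketch below follows the common LUNA16 tutorial implementations; the project's own helpers may differ:

import numpy as np

def worldToVoxelCoord(worldCoord, origin, spacing):
  # Map world coordinates (mm) onto the voxel grid of the scan.
  return np.absolute(worldCoord - origin) / spacing

def normalizePlanes(npzarray):
  # Clip Hounsfield units to [-1000, 400] and rescale to [0, 1].
  maxHU, minHU = 400.0, -1000.0
  npzarray = (npzarray - minHU) / (maxHU - minHU)
  npzarray[npzarray > 1] = 1.0
  npzarray[npzarray < 0] = 0.0
  return npzarray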
Example #5
def main():

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # DEFINE INPUTS AND LOAD DATA

    today_dt = datetime.date.today()
    yesterday_dt = today_dt - datetime.timedelta(days=1)
    dates_ds = pd.date_range(inputs.start_date, inputs.end_date)
    dates = [str(x.date()).replace('-', '/') for x in list(dates_ds)]
    overwrite = inputs.overwrite
    site = 'guardian'

    src = os.path.join(utilities.blm_dir, 'Google_CSE_Results',
                       site + '_articles.pkl')
    with open(src, 'rb') as f:
        dates_articles_ = pickle.load(f)
    interim_dir = os.path.join(utilities.blm_dir, 'z_Interim')
    utilities.mkdir(interim_dir)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Pull comment URLs')

    if overwrite:
        base_url = 'https://www.theguardian.com/discussion'
        dates_articles = copy.deepcopy(dates_articles_)
        counter = 0
        for date, days_articles in dates_articles.items():
            for ix, article in enumerate(days_articles):
                try:
                    article_url = article['url'].strip().lower()
                    r = requests.get(article_url)
                    # an explicit parser avoids bs4's "no parser specified" warning
                    article_soup = bs(r.text, 'html.parser')
                    comments_div = article_soup.find('div', {'id': 'comments'})
                    soup_id = comments_div.attrs['data-discussion-key']

                    comments_url = base_url + soup_id
                    dates_articles[date][ix]['comments_url'] = comments_url
                except Exception:
                    dates_articles[date][ix]['comments_url'] = 'no comments'

        dates_articles_dst = os.path.join(interim_dir,
                                          'articles_w_comments_urls.pkl')
        with open(dates_articles_dst, 'wb') as f:
            pickle.dump(dates_articles, f)
    else:
        # when not overwriting, reuse the file written by a previous overwrite run
        dates_articles_dst = os.path.join(interim_dir,
                                          'articles_w_comments_urls.pkl')
        with open(dates_articles_dst, 'rb') as f:
            dates_articles = pickle.load(f)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Remove articles with no comments')

    dates_articles2 = {}
    for date, days_articles in dates_articles.items():
        articles = []
        for article in days_articles:
            if article['comments_url'] != 'no comments':
                article_copy = copy.deepcopy(article)
                articles.append(article_copy)
        if len(articles) > 0:
            dates_articles2[date] = articles

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Scrape comments pages')

    if overwrite:
        dates_articles3 = {}
        counter = 0
        file_counter = 0
        for date, days_articles in dates_articles2.items():
            articles3 = []
            for article in days_articles:
                comments_url = article['comments_url']
                comments_li = []

                try:
                    comments_soup, comments = get_page_comments(comments_url)
                    comments_li.append(comments)
                    next_page_comments_url = get_next_page_url(comments_soup)

                    while next_page_comments_url is not None:
                        try:
                            next_page_comments_soup, next_page_comments = get_page_comments(
                                next_page_comments_url)
                            comments_li.append(next_page_comments)
                            next_page_comments_url = get_next_page_url(
                                next_page_comments_soup)
                        except Exception:
                            next_page_comments_url = None
                except Exception:
                    pass
                article3 = copy.deepcopy(article)
                article3['raw_comments'] = comments_li
                articles3.append(article3)
            if len(articles3) > 0:
                dates_articles3[date] = articles3

            counter += 1
            if counter >= 10:
                dst = os.path.join(utilities.blm_html_1pass_dir,
                                   site + str(file_counter) + '.pkl')
                with open(dst, 'wb') as f:
                    pickle.dump(dates_articles3, f)
                dates_articles3 = {}
                counter = 0
                file_counter += 1

        if counter > 0:
            dst = os.path.join(utilities.blm_html_1pass_dir,
                               site + str(file_counter + 1) + '.pkl')
            with open(dst, 'wb') as f:
                pickle.dump(dates_articles3, f)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Print outputs')

    recovered_articles_srcs = [
        x for x in utilities.get_files(utilities.blm_html_1pass_dir)
        if site in x
    ]
    recovered_articles = utilities.combine_dicts(recovered_articles_srcs)
    n_articles = utilities.count_articles(recovered_articles)
    print('Recovered %s on-topic articles with comments' % n_articles)
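
The reporting step uses two helpers from the utilities module that are not shown here. A minimal sketch of what they might do, inferred only from how they are called above; the real module may differ:

import pickle

def combine_dicts(srcs):
    # Load each pickled {date: [articles]} dict and merge them into one.
    combined = {}
    for src in srcs:
        with open(src, 'rb') as f:
            d = pickle.load(f)
        for date, articles in d.items():
            combined.setdefault(date, []).extend(articles)
    return combined

def count_articles(dates_articles):
    # Total number of articles across all dates.
    return sum(len(articles) for articles in dates_articles.values())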
Example #6
File: fp.py Project: xhongz/cancer
def train(args):
    with open(args.hype) as f:
        H = json.load(f)
        H['subset'] = args.subset
        H['save_dir'] = FPR_DIR + 'subset' + str(H['subset'])
        mkdir(H['save_dir'])
        if args.gpu is not None:
            H['gpu'] = args.gpu
    with open(META_DIR + 'fp.json') as fpj:
        meta = json.load(fpj)

    dat = {}
    dat['train'] = []
    dat['valid'] = []
    for i in range(10):
        if i == args.subset:
            dat['valid'] = meta['subset' + str(i)]
        else:
            dat['train'] += meta['subset' + str(i)]
    tf.set_random_seed(2012310818)

    os.environ['CUDA_VISIBLE_DEVICES'] = str(H['gpu'])
    gpu_options = tf.GPUOptions()
    gpu_options.allow_growth = True
    config = tf.ConfigProto(gpu_options=gpu_options)

    with tf.Session(config=config) as sess:
        (x, y, training, Xt, Yt, Xv, Yv, logits, loss, preds, opt, varst,
         gstep, train_opt, saver, fptrunk) = build(H, dat, sess)

        sess.run(tf.global_variables_initializer())
        fptrunk.start()
        if args.weight is not None:
            logging.info('Restoring from %s...' % args.weight)
            saver.restore(sess, args.weight)
        bsize = fptrunk.bsize
        train_batches = fptrunk.nbatches['train']
        valid_batches = fptrunk.nbatches['valid']
        for epoch in range(H['epochs']):
            tst = time.time()
            tol_loss, tol_tfn, tol_tfp, tol_vfn, N, P, vN, vP, tol_vfp, tol_acc, tol_vacc = [
                0.0
            ] * 11
            for step in range(1, train_batches):
                curX, curY = sess.run([Xt, Yt])
                _, tloss, tpreds = sess.run([train_opt, loss, preds],
                                            feed_dict={
                                                Xt: curX,
                                                Yt: curY,
                                                training: True
                                            })
                fn, fp = FPFN(curY, tpreds)
                N += np.sum(curY == 0)
                P += np.sum(curY == 1)
                tol_loss += tloss
                tol_tfn += fn
                tol_tfp += fp
                tol_acc += fp + fn
                if step % 100 == 0:
                    cnt = (step * bsize)
                    logstr = (
                        'Training batches %d, avg loss %f, acc %f, FN %d/%d, FP %d/%d.'
                        % (step, tol_loss / step, (cnt - tol_acc) / cnt,
                           tol_tfn, P - tol_tfn, tol_tfp, N - tol_tfp))
                    print(logstr)
                    logging.info(logstr)

            for step in range(valid_batches):
                curX, curY = sess.run([Xv, Yv])
                curY = curY.reshape(bsize, 1)
                tpreds = sess.run(preds,
                                  feed_dict={
                                      Xt: curX,
                                      Yt: curY,
                                      training: False
                                  })
                fn, fp = FPFN(curY, tpreds)
                vN += np.sum(curY == 0)
                vP += np.sum(curY == 1)
                tol_vfn += fn
                tol_vfp += fp
                tol_vacc += fn + fp

            t = time.time() - tst
            logstr = ('epoch %d, time elapse %f, training loss %f,' +
                      ' valid avg FN %f, FP %f, acc %f.') % (
                          epoch + 1, t,
                          tol_loss / train_batches, float(tol_vfn) / vP,
                          float(tol_vfp) / vN, tol_vacc / valid_batches)

            print(logstr)
            logging.info(logstr)
            saver.save(sess, H['save_dir'] + '/save.ckpt', global_step=gstep)

        logging.info('training finished, try ending...')
        fptrunk.stop()
        logging.info('ended...')
        sess.close()
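
The FPFN helper used above is not shown. A minimal sketch, assuming labels and predictions are 0/1 arrays of the same length; the project's version may differ:

import numpy as np

def FPFN(labels, preds):
    # Count false negatives (missed positives) and false positives (false alarms).
    labels = np.asarray(labels).reshape(-1)
    preds = np.asarray(preds).reshape(-1)
    fn = int(np.sum((labels == 1) & (preds == 0)))
    fp = int(np.sum((labels == 0) & (preds == 1)))
    return fn, fp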
Example #7
import utilities
from utilities import preview
import os
import json
import pandas as pd
import copy
import pickle
import keyring
import getpass
# config and inputs are project-local modules used below (CSE credentials, date range, overwrite flag)
import config
import inputs
from googleapiclient.discovery import build

dates_ds = pd.date_range(inputs.start_date, inputs.end_date)
dates = [str(x.date()).replace('-', '/') for x in list(dates_ds)]
overwrite = inputs.overwrite
site = 'guardian'
dst_dir = os.path.join(utilities.blm_dir, 'Google_CSE_Results')
utilities.mkdir(dst_dir)
res_dst = os.path.join(dst_dir, site + '_res_li.pkl')


def main():
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print(
        'Use google custom search api to retrieve on-topic articles for selected site'
    )
    # indicate the site on the google custom search api control panel

    # enter credentials from config.py file
    # if this file doesn't exist, create one and define google_custom_search_cx and developerKey variables
    cx = config.google_custom_search_cx
    developerKey = config.google_custom_search_developer_key
    service = build("customsearch", "v1", developerKey=developerKey)
        if (file_format == 'json') & (write_mode == 'w'):
            json.dump(d, f)
        elif (file_format == 'pkl') & (write_mode == 'wb'):
            pickle.dump(d, f)
        else:
            raise ValueError(
                "File format or write mode incorrect.\nOptions are 1) 'json', 'w' and 2) 'pkl', 'wb'"
            )

Example #8


# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# DEFINE INPUTS

site = 'breitbart'
dst_dir = os.path.join(utilities.blm_processed_parsed_dir, '2nd_iteration')
utilities.mkdir(dst_dir)  # make dst dir if it doesn't exist


def main():

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Parse comments')

    # combine scraped blm data
    blm_srcs = utilities.get_files(
        utilities.blm_html_1pass_dir) + utilities.get_files(
            utilities.blm_html_2pass_dir)
    blm_srcs = [x for x in blm_srcs if site in x]
    blm = utilities.combine_dicts(blm_srcs)

    # sort by date
Example #9
def main():
    savepath = 'train2_img/'
    utilities.mkdir(savepath)
    lmdb_file = 'train2'
    read_lmdb(lmdb_file, savepath)
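
read_lmdb is not shown here. A hedged sketch, assuming the LMDB stores Caffe-style Datum records (as produced by convert_imageset); if the database holds a different serialization, the decoding step changes:

import lmdb
import numpy as np
from PIL import Image
from caffe.proto import caffe_pb2

def read_lmdb(lmdb_file, savepath):
    # Iterate every record, decode it as a Datum and save the first channel as a bitmap.
    env = lmdb.open(lmdb_file, readonly=True)
    with env.begin() as txn:
        for i, (key, value) in enumerate(txn.cursor()):
            datum = caffe_pb2.Datum()
            datum.ParseFromString(value)
            arr = np.frombuffer(datum.data, dtype=np.uint8)
            arr = arr.reshape(datum.channels, datum.height, datum.width)
            Image.fromarray(arr[0]).save(savepath + str(i) + '.bmp')
    env.close()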
Example #10
    parser.add_argument('--test', dest='test', metavar='method', type=str, default="wilcox",
                        help="statistical test to use: wilcox or t (default: wilcox)")
    parser.set_defaults(paired=False)
    args = parser.parse_args()
    params = vars(args)
    params['paired'] = judge(params['paired'])
    return params

if __name__ == '__main__':
    params = read_params(sys.argv)
    bin_defdir = '%s/02.taxon' % const.bin_defdir
    out_dir = params["out_dir"]
    profile_table = params["profile_table"]
    group_file = params["group_file"]
    use_mothed = params["test"]
    mkdir(out_dir)
    env = Environment(loader=FileSystemLoader(bin_defdir), autoescape=False)
    template = env.get_template("g11_diff.R")
    Rtext = template.render(tool_default_dir=const.tool_defdir,
                            profile_table=profile_table,
                            group_file=group_file,
                            out_dir=out_dir,
                            mothed=use_mothed,
                            p_cutoff=params["cutoff"],
                            fdr=params["fdr"],
                            paired=params['paired'])
    with open("%s/diff.R" % out_dir, "w") as fqw:
        fqw.write(Rtext)
    Rrun("%s/diff.R" % out_dir)
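
Two helpers used above, judge() and Rrun(), are defined elsewhere in the project. Minimal sketches of what they plausibly do, inferred from their call sites:

import subprocess

def judge(value):
    # Interpret a textual boolean flag (e.g. 'T', 'true', 'yes') as a Python bool.
    return str(value).strip().lower() in ('1', 't', 'true', 'y', 'yes')

def Rrun(script_path):
    # Run the rendered R script with Rscript and raise if it exits non-zero.
    subprocess.check_call(['Rscript', script_path])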

Example #11
                        del draw
                        im.save(os.path.join(pimg, img_name[:-4] + '_gt.bmp'))
                        cur = {
                            "image_path": subset + '/' + img_name,
                            "rects": [{
                                "x1": x1,
                                "x2": x2,
                                "y1": y1,
                                "y2": y2,
                            }]
                        }
                        samples[subset].append(cur)

    for key in samples:
        with open(META_DIR + key + '.json', 'w') as f:
            json.dump(samples[key], f)


if __name__ == '__main__':
    dirs = [
        SAMPLE_DIR,
        META_DIR,
        FPR_DIR,
    ] + [SAMPLE_DIR + 'subset' + str(i) for i in range(10)]
    for d in dirs:
        util.mkdir(d)
    data_map = util.readImageMap(ANNOTATION_CSV)
    generate_data(TRUNK_DIR, data_map, SAMPLE_DIR)

# generate_scan_image(DATA_ROOT, OUTPUT_DIR)
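
util.readImageMap is not shown. A hedged sketch, assuming a candidates-style CSV with columns seriesuid, coordX, coordY, coordZ and class, stored in (z, y, x) order to match how voxelCoord is indexed in generate_data above:

import csv
import numpy as np

def readImageMap(csv_path):
    # Build {seriesuid: [(worldCoord, label), ...]} from the annotation/candidates CSV.
    data_map = {}
    with open(csv_path) as f:
        for row in csv.DictReader(f):
            world = np.array([float(row['coordZ']),
                              float(row['coordY']),
                              float(row['coordX'])])
            data_map.setdefault(row['seriesuid'], []).append((world, int(row['class'])))
    return data_map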
Example #12
def eliminate(args):
    with open(args.hype) as f:
        H = json.load(f)
        H['subset'] = args.subset
        if args.gpu is not None:
            H['gpu'] = args.gpu
        H['epoch'] = args.epoch
        H['weights'] = args.weights
        H['fpepoch'] = args.fpepoch
        H['save_dir'] = 'data/output.eliminate/' + 'subset' + str(H['subset'])
        mkdir(H['save_dir'])

    os.environ['CUDA_VISIBLE_DEVICES'] = str(H['gpu'])
    gpu_options = tf.GPUOptions()
    gpu_options.allow_growth = True
    config = tf.ConfigProto(gpu_options=gpu_options)
    tf.set_random_seed(2012310818)

    with tf.Session(config=config) as sess:
        xv = tf.placeholder(tf.float32, shape=[1, 64, 64, 1])
        logits, pred = model(H, xv, training=True)

        saver = tf.train.Saver(max_to_keep=None)
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, H['weights'])
        voxelW = 65
        SUBSET_DIR = DETECT_DIR + 'subset' + str(H['subset']) + '/'

        with open(SUBSET_DIR + 'result_' + str(H['epoch']) + '.json') as f:
            detects = json.load(f)
            i = 0
            for it in detects:
                boxes = it['box']
                if len(boxes) > 0:
                    img = cv2.imread(SAMPLE_DIR + it['file'],
                                     0).astype(np.float32)
                    rboxes = []
                    for box in boxes:
                        x, y = int((box[0] + box[2]) / 2), int(
                            (box[1] + box[3]) / 2)
                        if x - voxelW // 2 < 0:
                            x = 0
                        if x + voxelW // 2 >= img.shape[1]:
                            x = img.shape[1] - voxelW
                        if y - voxelW // 2 < 0:
                            y = 0
                        if y + voxelW // 2 >= img.shape[0]:
                            y = img.shape[0] - voxelW
                        patch = img[y:y + voxelW - 1, x:x + voxelW - 1]

                        y_, logits_ = sess.run(
                            [pred, logits],
                            feed_dict={xv: patch.reshape(1, 64, 64, 1)})
                        if y_ == 1:
                            rboxes.append(box)
                            cv2.imwrite(str(i) + '.bmp', patch)
                            i = i + 1
                            print(logits_, y_)
                    it['box'] = rboxes

        generate_result(TRUNK_DIR, detects,
                        SUBSET_DIR + str(H['epoch']) + '.csv', 0.1)
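
A hedged sketch of command-line wiring that would call eliminate(); the flag names mirror the attributes read from args above, but the defaults are assumptions:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--hype', required=True, help='path to the hyperparameter JSON')
    parser.add_argument('--subset', type=int, default=0)
    parser.add_argument('--gpu', type=int, default=None)
    parser.add_argument('--epoch', type=int, default=0)
    parser.add_argument('--weights', required=True, help='model checkpoint to restore')
    parser.add_argument('--fpepoch', type=int, default=0)
    eliminate(parser.parse_args())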