Example #1
    def parse_item(self, response):
        hxs = Selector(response)
        item_titles = extract(hxs, "//div[@class='gl-i-wrap j-sku-item']//a/em/text()")
        top_id = extract_one(hxs, '//*[@id="J_crumbsBar"]/div/div/div/div[1]/a/text()')
        type_id1 = extract(hxs, '//*[@id="J_crumbsBar"]//div[@class="trigger"]/span/text()')[0]
        type_id2 = extract(hxs, '//*[@id="J_crumbsBar"]//div[@class="trigger"]/span/text()')[-1]

        if type_id1 != type_id2:
            for i, t in enumerate(item_titles):
                if i < 20:
                    good = {
                        'mall': '2',
                        'rank': str(i + 1),
                        'title': t,
                        'price': '0',
                        'turnover_index': '0',
                        'top_id': top_id,
                        'type_id1': type_id1,
                        'type_id2': type_id2,
                        'url': response.url
                    }

                    yield Good(good)

        for link in self.normal_url_extractor.extract_links(response):
            yield SplashRequest(link.url, callback=self.parse_url, args={'wait': 0.5, 'html': 1, })

        for link in self.needed_url_extractor.extract_links(response):
            if 'ev' not in link.url:
                url = re.sub(r'page=.*&', 'page=1&', link.url)
                url = re.sub(r'stock=.*&', 'stock=0&', url)
                url = re.sub(r'delivery_daofu=.*&', 'delivery_daofu=0&', url)
                url = re.sub(r'delivery=.*&', 'delivery=0&', url)
                yield SplashRequest(url, callback=self.parse_item, args={'wait': 0.5, 'html': 1, })
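The Scrapy-based examples on this page call free-standing extract/extract_one helpers that the listing never defines. A minimal sketch of what they plausibly are, assuming they simply wrap Scrapy's selector API (the names and signatures are inferred, not taken from the original projects):

def extract(selector, xpath):
    # Hypothetical helper: all matching nodes as a list of strings.
    return selector.xpath(xpath).getall()

def extract_one(selector, xpath):
    # Hypothetical helper: the first match only, or None when nothing matches.
    return selector.xpath(xpath).get()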
Example #2
    def parse_item(self, response):
        hxs = Selector(response)
        top_id = re.findall(r'.*&topId=(\S+_\S+)&type.*', response.url)[0]
        #        type_id=re.findall(r'.*leafId=(\d+)&rank=.*',response.url)[0]
        type_id1 = extract_one(
            hxs,
            "//div[@class='block-body ']/div[@class='params-cont']/a[@class='param-item icon-tag param-item-selected']/text()"
        )
        ranks_tuple = extract(
            hxs,
            '//*[@class="rank-num rank-focus"]/text()|//*[@class="rank-num rank-important"]/text()|//*[@class="rank-num rank-"]/text()'
        )
        ranks = []
        for r in ranks_tuple:
            if r.strip() != '':
                ranks.append(r)

        titles = extract(hxs, '//*[@class="title"]/a/text()')
        prices = extract(hxs, '//*[@class="col3 col"]/text()')[1:]
        turnover_indexs = extract(hxs, '//*[@class="focus-bar"]/span/text()')

        for r, t, p, i in zip(ranks, titles, prices, turnover_indexs):
            good = {
                'mall': '0',
                'rank': r.strip(),
                'title': t.strip(),
                'price': p.split('¥')[-1].strip(),
                'turnover_index': i.strip(),
                'top_id': top_id.strip(),
                'type_id1': type_id1.strip(),
                'type_id2': '',
                'url': response.url
            }
            yield Good(good)
Example #4
def load_lists(opt):
    arch_name = 'ava_v{}.zip'.format(opt.version)
    arch_path = os.path.join(opt.out_path, arch_name)
    arch_url = 'https://research.google.com/ava/download/{}'.format(arch_name)
    if utils.download_file(arch_url, arch_path):
        utils.extract(arch_path, opt.out_path)

    train_video_ids, val_video_ids, test_video_ids = None, None, None
    if opt.type is None or opt.type == 'train':
        ids_file_path = os.path.join(opt.out_path,
                                     'ava_train_v{}.csv'.format(opt.version))
        train_video_ids = read_ids(ids_file_path)
    if opt.type is None or opt.type == 'validation':
        ids_file_path = os.path.join(opt.out_path,
                                     'ava_val_v{}.csv'.format(opt.version))
        val_video_ids = read_ids(ids_file_path)
    if opt.type is None or opt.type == 'test':
        ids_file_path = os.path.join(opt.out_path,
                                     'ava_test_v{}.txt'.format(opt.version))
        test_video_ids = read_ids(ids_file_path)

    ts_file_name = 'ava_included_timestamps_v{}.txt'.format(opt.version)
    ts_file_path = os.path.join(opt.out_path, ts_file_name)
    with open(ts_file_path) as f:
        lines = f.readlines()
        timestamps = int(lines[0]), int(lines[-1])

    return train_video_ids, val_video_ids, test_video_ids, timestamps
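read_ids is not shown in the listing; a hedged sketch consistent with how it is used above, assuming each list file carries the video id in its first comma-separated column (an assumption, not the project's code):

def read_ids(file_path):
    # Hypothetical reader: unique ids, one per input line.
    ids = []
    with open(file_path) as f:
        for line in f:
            video_id = line.strip().split(',')[0]
            if video_id and video_id not in ids:
                ids.append(video_id)
    return ids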
Example #6
def preprocess(no_wells_marmousi, no_wells_seam):
    """Function initializes data, performs standardization, and train test split
    
    Parameters:
    ----------
    no_wells_marmousi : int,
        number of evenly spaced wells and seismic samples to be evenly sampled 
        from marmousi section.
        
    no_wells_seam : int
        number of evenly spaced wells and seismic samples to be evenly sampled from SEAM
        
    Returns
    -------
    seismic_marmousi : array_like, shape(num_traces, depth samples)
        2-D array containing seismic section for marmousi
        
    seismic_seam : array_like, shape(num_traces, depth samples)
        2-D array containing seismic section for SEAM
        
    model_marmousi : array_like, shape(num_wells, depth samples)
        2-D array containing model section from marmousi 2
        
    model_seam : array_like, shape(num_wells, depth samples)
        2-D array containing model section from SEAM
    
    """

    # get project root directory
    project_root = os.getcwd()

    if not os.path.isdir('data'):  # if data directory does not exist then extract
        extract('data.zip', project_root)

    # Load data
    seismic_marmousi = np.load(join(
        'data', 'marmousi_synthetic_seismic.npy')).squeeze()
    seismic_seam = np.load(join('data',
                                'poststack_seam_seismic.npy')).squeeze()[:,
                                                                         50:]
    seismic_seam = seismic_seam[::2, :]

    # Load targets and standardize data
    model_marmousi = np.load(join('data',
                                  'marmousi_Ip_model.npy')).squeeze()[::5, ::4]
    model_seam = np.load(join('data',
                              'seam_elastic_model.npy'))[::3, :, ::2][:, :,
                                                                      50:]
    model_seam = model_seam[:, 0, :] * model_seam[:, 2, :]

    # standardize
    seismic_marmousi, model_marmousi = standardize(seismic_marmousi,
                                                   model_marmousi,
                                                   no_wells_marmousi)
    seismic_seam, model_seam = standardize(seismic_seam, model_seam,
                                           no_wells_seam)

    return seismic_marmousi, seismic_seam, model_marmousi, model_seam
Example #7
def install():
    fetch("http://ftp.gnome.org/pub/gnome/sources/json-glib/0.16/json-glib-%(json-glib)s.tar.xz")
    extract("json-glib-%(json-glib)s.tar.xz")
    configure(
        "json-glib-%(json-glib)s", ["--prefix=%s" % env.prefix, "--disable-gcov", "--disable-introspection", "CC=clang"]
    )
    make("json-glib-%(json-glib)s")
    make("json-glib-%(json-glib)s", "install")
Example #8
    def test_extract(self):
        #empty output directory
        utils.init_path(self.output_dir)
        utils.extract(self.archive_file, self.output_dir)
        files = os.listdir(self.output_dir)

        with tarfile.open(self.archive_file) as f:
            for file in files:
                assert file in f.getnames()
Example #10
 async def autoip(self, ctx, *, options: str = None):
     options = options.split(" ") if options else []
     version = extract(options, "46")
     addr_class = extract(options, "abc", func="lower")
     if not version or version != "6":
         func = self.fake.ipv4(address_class=addr_class)
     else:
         func = self.fake.ipv6()
     return await self.send(ctx, "IP address", func)
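Here extract operates on the options list itself. A guess at its semantics, judging only from the call sites in this snippet and Example #15: remove and return the first token whose (optionally transformed) value is among the allowed choices, else fall back to a default. The implementation below is an assumption:

def extract(options, choices, func=None, default=None):
    # Hypothetical: pop and return the first matching option token.
    for token in options:
        value = getattr(token, func)() if func else token
        if value in choices:
            options.remove(token)
            return value
    return default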
Example #11
def download_libs(constants: Constants, settings: dict,
                  progress: sg.ProgressBar):
    archive = os.path.join(settings["installdir"], "libraries.zip")
    utils.download_file(constants.win64_dev_libs, archive, 2492854)
    utils.extract(archive, settings["installdir"])

    dlldir = os.path.join(settings["installdir"], "dev64", "bin")
    for lib in os.listdir(dlldir):
        shutil.copyfile(os.path.join(dlldir, lib),
                        os.path.join(settings["installdir"], lib))
Example #12
def incorrectly_ordered_boolean(if_ast: dict, code, code_identifier_lst):
    if if_ast["test"]["type"] == "LogicalExpression" and if_ast["test"][
            "operator"] == "&&":
        code_left = utils.extract(if_ast["test"]["left"]["loc"], code)
        code_right = utils.extract(if_ast["test"]["right"]["loc"], code)
        if code_left in code_right:  # TODO similarity
            tmp = if_ast["test"]["left"]
            if_ast["test"]["left"] = if_ast["test"]["right"]
            if_ast["test"]["right"] = tmp
            return True
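In this project utils.extract appears to slice source text by an ESTree-style loc range (Example #43 passes an extra padding argument). A sketch under that assumption, ignoring column offsets:

def extract(loc, code, padding=0):
    # Hypothetical: the source lines covered by `loc`, widened by
    # `padding` lines on each side.
    lines = code.splitlines()
    start = max(loc["start"]["line"] - 1 - padding, 0)
    end = loc["end"]["line"] + padding
    return "\n".join(lines[start:end])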
Example #13
 def parse(self, response):
     xxs = scrapy.Selector(response)
     source = "baidu_" + extract(xxs.xpath('//channel/title/text()'))
     for xItem in xxs.xpath('//item'):
         item = Article()
         item['source'] = source
         item['title'] = extract(xItem.xpath('./title/text()'))
         item['link'] = extract(xItem.xpath('./link/text()'))
         item['desc'] = extract(xItem.xpath('./description/text()'))
         item['pubDate'] = extract(xItem.xpath('./pubDate/text()'))
         yield item
Example #14
def read_dataframe(out_path, split):
    arch_url = KIN_PARAMS[split]['arch_url']
    arch_path = os.path.join(out_path, KIN_PARAMS[split]['arch_name'])
    csv_name = KIN_PARAMS[split]['csv_name']
    csv_path = os.path.join(out_path, csv_name)
    if utils.download_file(arch_url, arch_path):
        utils.extract(arch_path, out_path, csv_name)
    df = pd.read_csv(csv_path)
    if 'label' in df.columns:
        df['label'] = df['label'].astype('category')
    return df
Example #15
 async def autocolor(self, ctx, *, options: str = None):
     options = options.split(" ") if options else []
     color_format = extract(options, COLOR_FORMATS, func="lower", default="hex")
     hue = extract(options, HUES, func="lower")
     luminosity = extract(options, LUMINOSITIES, func="lower", default="random")
     return await self.send(
         ctx,
         f"{color_format} color",
         self.fake.color(
             hue=hue, luminosity=luminosity, color_format=color_format
         )
     )
Example #16
def _get_toolchain(tree, root, keep_archive=False, clean=False):
	archive = _create_name(tree, suffix=".tar.bz2")
	tree.insert(0, root)
	dir = path.join(_create_path(tree))
	archive_dir = path.join(dir, archive)
	
	if not check_path(dir, clean):
		if download(TOOLCHAIN_FORGE + archive, archive_dir):
			extract(archive_dir, dir)
			if not keep_archive:
				remove(archive_dir)
	else:
		print "! %s already exists" % dir
Example #17
def install():
    fetch('http://www.pell.portland.or.us/~orc/Code/discount/discount-%(discount)s.tar.bz2')
    extract('discount-%(discount)s.tar.bz2')
    configure('discount-%(discount)s', ['--prefix=%s' % env.prefix,
                                        '--libdir=%s/lib' % env.prefix,
                                        '--mandir=%s/man' % env.prefix,
                                        '--shared',
                                        '--enable-all-features'],
              'configure.sh')
    run('sed -i .bkp -e "/ldconfig/d" %s/%s/librarian.sh' %
        (env.build, 'discount-%(discount)s' % env.versions))
    make('discount-%(discount)s')
    make('discount-%(discount)s', 'install')
Example #18
def video_info(filepath):
    """Return some video meta information as a dictionary."""
    ffmpeg = subprocess.Popen("ffmpeg -i " + filepath,
                              shell=True,
                              stderr=subprocess.PIPE)
    stdout, stderr = ffmpeg.communicate()
    info = {}
    info['creation'] = extract('creation_time[ ]+: ([0-9-]* [0-9:]*)', stderr,
                               1, timestamp)
    info['duration'] = extract('Duration: ([0-9:\.]*)', stderr, 1)
    info['detected fps'] = extract('([0-9]*.?[0-9]*) fps,', stderr, 1, float)
    info['w'], info['h'] = extract('Stream.*, ([0-9]+)x([0-9]+)', stderr,
                                   (1, 2), lambda (x, y): (int(x), int(y)))
    return info
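The extract used by video_info looks like a small regex helper: search the text, pull out a group or group tuple, and optionally cast it. A hedged sketch, not the project's actual definition:

import re

def extract(pattern, text, group, cast=None):
    # Hypothetical: requested group(s) of the first match, optionally cast;
    # None when the pattern does not match.
    match = re.search(pattern, text)
    if match is None:
        return None
    value = match.group(group) if isinstance(group, int) else match.group(*group)
    return cast(value) if cast else value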
Example #19
 def parse(self, response):
     hxs = Selector(response, type="html")
     item_url_list = extract(hxs, "//div[@class='block-body ']/div[@class='params-cont']/a/@href")
     #        //div[@class='block-body ']/div[@class='params-cont']/a/@href
     for url in item_url_list:
         url = url.replace('./index.php?', 'https://top.taobao.com/index.php?')
         yield SplashRequest(url, callback=self.extract_url, args={'wait': 0.5, 'html': 1})
Example #21
    def search(cls, query_params):

        # NOTE: Params 'recursive' and 'with_responses' are currently not used by
        # either the 'search' or 'get_all' actions below.  Both already use
        # with_responses=False internally in the comment service, so no additional
        # optimization is required.
        params = {
            'page': 1,
            'per_page': 20,
            'course_id': query_params['course_id'],
        }
        params.update(utils.strip_blank(utils.strip_none(query_params)))

        if query_params.get('text'):
            url = cls.url(action='search')
        else:
            url = cls.url(action='get_all',
                          params=utils.extract(params, 'commentable_id'))
            if params.get('commentable_id'):
                del params['commentable_id']
        response = utils.perform_request(
            'get',
            url,
            params,
            metric_tags=[u'course_id:{}'.format(query_params['course_id'])],
            metric_action='thread.search',
            paged_results=True)
        if query_params.get('text'):
            search_query = query_params['text']
            course_id = query_params['course_id']
            group_id = query_params.get('group_id')
            requested_page = params['page']
            total_results = response.get('total_results')
            corrected_text = response.get('corrected_text')
            # Record search result metric to allow search quality analysis.
            # course_id is already included in the context for the event tracker
            tracker.emit(
                'edx.forum.searched', {
                    'query': search_query,
                    'corrected_text': corrected_text,
                    'group_id': group_id,
                    'page': requested_page,
                    'total_results': total_results,
                })
            log.info(
                u'forum_text_search query="{search_query}" corrected_text="{corrected_text}" course_id={course_id} group_id={group_id} page={requested_page} total_results={total_results}'
                .format(search_query=search_query,
                        corrected_text=corrected_text,
                        course_id=course_id,
                        group_id=group_id,
                        requested_page=requested_page,
                        total_results=total_results))

        return utils.CommentClientPaginatedResult(
            collection=response.get('collection', []),
            page=response.get('page', 1),
            num_pages=response.get('num_pages', 1),
            thread_count=response.get('thread_count', 0),
            corrected_text=response.get('corrected_text', None))
Example #22
def parse_shebang(s: str) -> str:
    """repos
    extract token from shebang like `#!/bin/sh`
    https://en.wikipedia.org/wiki/Shebang_(Unix)
    :param s: shebang
    :return: shebang token
    """
    script = s
    try:
        match = extract(s, REGEX_SHEBANG_FULL)
        script = match.group().split('/')[-1]
        pos = match.end()
        match = extract(s, REGEX_SHEBANG_WHITESPACE, pos=pos)
        pos = match.end()
        match = extract(s, REGEX_SHEBANG_NON_WHITESPACE, pos=pos)
        return extract(match.group(), compile(r'[^\d]+')).group(0)
    except ExtractException:
        return script
Example #23
    def __init__(self, bug, hash):
        """
        Initialize comments

        :arg hash: Dictionary of comment details
        :arg bug: Instance of :class:`~bz_xmlrpc.classes.Bug` object

        :return: Instance of :class:`Comment`
        .. note::
            No need to use this directly. 
            Use :meth:`~bz_xmlrpc.classes.Bug.get_comments()`
        """
        self._hash = hash
        self.id = extract(hash, 'id', 'comment_id')
        self.author = extract(hash, 'email', 'author')
        self.bug = bug
        self.is_private = bool(extract(hash, 'is_private', 'isprivate'))
        self.text = extract(hash, 'text', 'body')
        self.time = to_datetime(extract(hash, 'time', 'bug_when'))
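The Bugzilla classes (this example and Examples #26, #42, #53, and #54) all call extract(hash, *keys). A plausible reading, consistent with the _be wrapper in Example #53: return the value of the first key present in the dictionary. This is an inference, not the library's published code:

def extract(hash, *keys):
    # Hypothetical: first key present in the mapping wins; otherwise None.
    for key in keys:
        if key in hash:
            return hash[key]
    return None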
Example #25
    def do_scatter(i, j, ax):
        """ Draw single scatter plot
        """
        xs, ys = utils.extract(i, j, steadies)
        ax.scatter(xs, ys)

        ax.set_xlabel(r"$S_%d$" % i)
        ax.set_ylabel(r"$S_%d$" % j)

        cc = utils.get_correlation(xs, ys)
        ax.set_title(r"Corr: $%.2f$" % cc)
Example #26
 def __init__(self, hash):
     """
     Initialize
     """
     self._hash = hash
     if isinstance(hash, str):
         # Hack for searched bug groups
         self.name = hash
         self.ison = True
     else:
         self.bit = extract(hash, 'bit', 'id')
         self.name = extract(hash, 'name')
         self.description = extract(hash, 'description')
         self.ingroup = bool(extract(hash, 'ingroup'))
         self.ison = bool(extract(hash, 'ison'))
         self.mandatory = bool(extract(hash, 'mandatory'))
         self.othercontrol = bool(extract(hash, 'othercontrol'))
         self.direct = bool(extract(hash, 'direct'))
         self.isbuggroup = bool(extract(hash, 'isbuggroup'))
         self.userregexp = extract(hash, 'userregexp')
Example #30
File: tm.py Project: lzj3278/spider
    def parse_item(self, response):
        hxs = Selector(response)
        item_titles = extract(hxs, "//div[@id='J_ItemList']//p[@class='productTitle']/a/text()")
        top_id = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]/li[2]/a/text()')
        type_id1 = extract(hxs, '//*[@id="J_CrumbSlideCon"]//div[@class="crumbDrop j_CrumbDrop"]/a/text()')
        if type_id1 is not None:
            if len(type_id1) > 1:
                type_id2 = type_id1.split('/n')[-1]
            else:
                type_id2 = ''
            type_id1 = type_id1.split('/n')[0]
            titles = []
            title = ''
            for t in item_titles:
                if not t.endswith('\n'):
                    title += t.strip()
                elif t.endswith('\n'):
                    title += t.strip()
                    if len(title) > 5:
                        titles.append(title.strip())
                    title = ''

            if len(titles) > 19:
                for i, t in enumerate(titles):
                    if i < 20:
                        good = {
                            'mall': '1',
                            'rank': str(i + 1),
                            'title': t.strip(),
                            'price': '0',
                            'turnover_index': '0',
                            'top_id': top_id.strip(),
                            'type_id1': type_id1.strip(),
                            'type_id2': type_id2.strip(),
                            'url': response.url
                        }

                        yield Good(good)

        for link in self.normal_url_extractor.extract_links(response):
            yield SplashRequest(link.url, callback=self.parse, args={'wait': 0.5, 'html': 1, })
Example #31
def preprocess(no_wells):
    """Function initializes data, performs standardization, and train test split
    
    Parameters:
    ----------
    no_wells : int,
        number of evenly spaced wells and seismic samples to be evenly sampled 
        from seismic section.

        
    Returns
    -------
    seismic : array_like, shape(num_traces, depth samples)
        2-D array containing seismic section 
        
    model : array_like, shape(num_wells, depth samples)
        2-D array containing model section 

    """

    # get project root directory
    project_root = os.getcwd()

    if not os.path.isdir('data'):  # if data directory does not exist then extract
        extract('data.zip', project_root)

    # Load data
    seismic = np.load(join('data',
                           'poststack_seam_seismic.npy')).squeeze()[:, 50:]
    seismic = seismic[::2, :]

    # Load targets and standardize data
    model = np.load(join('data', 'seam_elastic_model.npy'))[::3, :, ::2][:, :,
                                                                         50:]
    model = model[:, 0, :] * model[:, 2, :]

    # standardize
    seismic, model = standardize(seismic, model, no_wells)

    return seismic, model
Example #32
 def criterion(output, target):
     prop, box = output
     target_instance = extract(target.cpu(), box, resize)
     loss, ok = 0., False
     for i, img in enumerate(target_instance):
         if prop[i] is not None:
             z_target = ab2z(img)
             loss += MCE(prop[i].cpu(), z_target, weights=w[z_target.argmax(dim=-1)]).mean()
             ok = True
     if not ok:
         loss = torch.tensor(0., requires_grad=True)
     return loss
Example #33
    def __init__(self, mapp, img, K):
        self.K = K
        self.Kinv = np.linalg.inv(self.K)
        self.pose = np.eye(4)
        self.h, self.w = img.shape[0:2]

        self.kpus, self.des = extract(img)
        self.kps = normalize(self.Kinv, self.kpus)
        self.pts = [None] * len(self.kps)

        self.id = len(mapp.frames)
        mapp.frames.append(self)
Example #34
    def fetch(self):
        """Download and extract the dataset."""

        home = self.home()
        if not path.exists(home):
            os.makedirs(home)

        # download archives
        archive_filenames = []
        for key, archive in self.ARCHIVES.iteritems():
            url = archive['url']
            sha1 = archive['sha1']
            basename = path.basename(url)
            archive_filename = path.join(home, basename)
            if not path.exists(archive_filename):
                download(url, archive_filename, sha1=sha1)
            archive_filenames += [(archive_filename, sha1)]
            self.ARCHIVES[key]['archive_filename'] = archive_filename

        # extract them
        for name, archive in self.ARCHIVES.iteritems():
            archive_dir = path.join(home, name)
            if os.path.exists(archive_dir):
                continue
            url = archive['url']
            sha1 = archive['sha1']
            archive_filename = archive['archive_filename']
            extract(archive_filename, home, sha1=sha1, verbose=True)
            # move around stuff if needed
            if 'moves' in archive:
                for move in archive['moves']:
                    src = self.home(move['source'])
                    dst = self.home(move['destination'])
                    # We can't use shutil here since the destination folder
                    # may already exist. Fortunately the distutils can help
                    # us here (see standard library).
                    dir_util.copy_tree(src, dst)
                    dir_util.remove_tree(src)
Example #36
    def fetch(self, download_if_missing=True):
        """Download and extract the dataset."""

        home = self.home()

        if not download_if_missing:
            raise IOError("'%s' exists!" % home)

        # download archive
        url = self.URL
        sha1 = self.SHA1
        basename = path.basename(url)
        archive_filename = path.join(home, basename)
        if not path.exists(archive_filename):
            if not download_if_missing:
                return
            if not path.exists(home):
                os.makedirs(home)
            download(url, archive_filename, sha1=sha1)

        # extract it
        if not path.exists(self.home(self.SUBDIR)):
            extract(archive_filename, home, sha1=sha1, verbose=True)
Example #38
    def parse_item(self, response):
        hxs = Selector(response)
        search_condition = extract_one(hxs, '//*[@id="J_CrumbSearchInuput"]/@value')
        item_titles = extract(hxs, "//div[@id='J_ItemList']//p[@class='productTitle']/a/text()")
        top_id = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]/li[2]/a/text()')
        type_id1 = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]//div[@class="crumbDrop j_CrumbDrop"]/a/text()')
        if type_id1 is not None and search_condition is not None:
            type_id1 = type_id1.split('\n')[0]
            titles = []
            title = ''
            for t in item_titles:
                title += t.strip()
                if t.endswith('\n'):
                    if len(title) > 5:
                        titles.append(title.strip())
                    title = ''

            if len(titles) > 19 and search_condition != type_id1:
                for i, t in enumerate(titles):
                    if i < 20:
                        good = {
                            'mall': '1',
                            'rank': str(i + 1),
                            'title': t.strip(),
                            'price': '0',
                            'turnover_index': '0',
                            'top_id': top_id.strip(),
                            'type_id1': type_id1.strip(),
                            'type_id2': search_condition.strip(),
                            'url': response.url
                        }

                        yield Good(good)

        for link in self.needed_url_extractor.extract_links(response):
            if ('industryCatId' in link.url and 'cat' in link.url
                    and 'post_fee' not in link.url and 'brand' not in link.url):
                url = re.sub(r'sort=.*&', 'sort=d&', link.url)
                url = re.sub(r'search_condition=.*&', 'search_condition=7', url)
                url = re.sub(r'miaosha=.*&', 'miaosha=0&', url)
                url = re.sub(r'wwonline=.*&', 'wwonline=0&', url)
                yield SplashRequest(url, callback=self.parse_item, args={'wait': 0.5, 'html': 1})
Example #39
 def get_and_run_installer(self, installer):
     installer_exe = os.path.abspath(os.path.basename(installer))
     if not os.path.isfile(installer_exe):
         print 'Downloading', installer, '..',
         installer_exe = download(installer)
         if installer_exe is None:
             print 'Download FAILED'
             return False
         print 'DONE'
     if os.path.splitext(installer_exe)[-1] in ['.zip']:
         install_path = self.get_install_path(installer_exe)
         if install_path is not None:
             if not os.path.isdir(install_path):
                 os.makedirs(install_path)
             return bool(extract(installer_exe, install_path))
     elif not start_installer(installer_exe):
         print 'Failed to start', installer_exe
         return False
     return True
Example #41
def extract_triples(hbt_model, save_weights_path, path, author, subject_model,
                    object_model, tokenizer, id2rel):
    workbook = xlwt.Workbook(encoding='utf-8')
    ws = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
    ws.write(0, 0, "head")
    ws.write(0, 1, "tail")
    ws.write(0, 2, "relation")

    hbt_model.load_weights(save_weights_path)
    triples = extract(path, subject_model, object_model, tokenizer, id2rel)
    count = 0

    triple_str = ""
    for triple_list in triples:
        for triple in triple_list:
            count += 1
            ws.write(count, 0, triple[0])
            ws.write(count, 1, triple[1])
            ws.write(count, 2, triple[2])
    workbook.save(path + author + ".xls")
Example #42
    def __init__(self, **kwargs):
        """
        Initialize a Bugzilla instance.

        Optional Arguments:
        -------------------
        url                     : The Bugzilla URL.
                May or may not end with /xmlrpc.cgi.
                If it does not end with /xmlrpc.cgi, it will be assumed.
                If not provided, the value of BUGZILLA_URL is used by default.

        cookie_jar|cookiejar    : cookielib.CookieJar/MozillaCookieJar object.
        user|username|login     : Bugzilla login, usually an email id.
        password|passwd         : Password for bugzilla.
        http_proxy|proxy        : String specifying the HTTP proxy of the
                client's connection.
                Usually of the form server:port or http://server:port.
        bypass                  : Boolean value; asks the client to bypass
                password auth and use cookies if present.

        """
        # Initialize public attributes for an unlogged instance
        self.user_agent = USER_AGENT
        self.logged_in = False
        self.user_id = None

        self._init_private_data()
        # Extract provided values or default
        self._cookiejar = extract(kwargs, 'cookie_jar', 'cookiejar')
            
        self.url = extract(kwargs, 'url') or BUGZILLA_URL

        self.user = extract(kwargs, 'user', 'username', 'login') or ''
        self.password = extract(kwargs, 'password', 'passwd') or ''

        self.http_proxy = extract(kwargs, 'http_proxy', 'proxy') or ''
        self.bypass = extract(kwargs, 'bypass') or ''

        cookie_dir = extract(kwargs, 'cookie_dir') or COOKIE_DIR
        if not os.path.exists(cookie_dir):
            os.mkdir(cookie_dir)
        self.cookiefile = os.path.join(cookie_dir, '%s.cookie' % self.user)

        self.connect()
Example #43
def wrong_identifier(if_ast: dict, code, code_identifier_lst):
    code_condition_padded = utils.extract(if_ast["test"]["loc"],
                                          code,
                                          padding=5)
    condition_identifier_lst = []
    utils.dict_visitor(if_ast["test"], identifiers=condition_identifier_lst)
    if len(condition_identifier_lst):
        identifier_to_augment = random.choice(condition_identifier_lst)
        # TODO identifier must stand alone?
        for identifier in code_identifier_lst:
            identifier_start = identifier["loc"]["start"]["line"]
            augment_start = identifier_to_augment["loc"]["start"]["line"]
            if identifier_start < (
                    augment_start -
                    5) and identifier["name"] not in code_condition_padded:
                # TODO choose most similar identifier
                # TODO near neighborhood could be feasible
                identifier_to_augment["name"] = identifier["name"]
                random.shuffle(code_identifier_lst)
                return True
Example #44
    def predict(self, F, data_info, time_info):
        '''
        This function should provide predictions of labels on (test) data.
        Make sure that the predicted values are in the correct format for the
        scoring metric. For example, binary classification problems often expect
        predictions in the form of a discriminant value (if the area under the
        ROC curve is the metric) rather than predictions of the class labels
        themselves. The function eventually returns probabilities or continuous
        values.
        '''

        info_dict = extract(data_info, time_info)
        print_time_info(info_dict)

        if params['algo'] == Algo.OLD_CODE:
            return self.mdl.predict(F, data_info, time_info)
        elif params['algo'] == Algo.ORIGINAL:
            return self._original_predict(F, info_dict)
        elif params['algo'] == Algo.FACEBOOK_LR:
            return self._facebook_lr_predict(F, info_dict)
        elif params['algo'] == Algo.BASIC:
            return self._basic_predict(F, info_dict)
Example #45
 def on_press(self, event):
     value = self.text_ctrl.GetValue()
     if not value:
         print("You didn't enter anything!")
     else:
         self.text_ctrl.Hide()
         png = wx.Image('img/whatever.png',
                        wx.BITMAP_TYPE_ANY).ConvertToBitmap()
         wx.StaticBitmap(self, -1, png, (0, 0),
                         (png.GetWidth(), png.GetHeight()))
         if os.path.exists("result.json"):
             os.remove("result.json")
         wordlist = utils.extract(value)
         words = ",".join(wordlist)
         path = utils.getPath()
         utils.crawl(words)
         output = utils.process()
         utils.writelist(output, path)
         png = wx.Image('img/finish.png',
                        wx.BITMAP_TYPE_ANY).ConvertToBitmap()
         wx.StaticBitmap(self, -1, png, (0, 0),
                         (png.GetWidth(), png.GetHeight()))
Example #46
 def get_and_install_source(self, installer):
     installer_file = os.path.abspath(os.path.basename (installer))
     if not os.path.isfile (installer_file):
         print 'Downloading', installer, '..',
         installer_file = download (installer)
         if installer_file is None:
             print 'Download FAILED'
             return False
         print 'DONE'
     install_path = self.get_install_path(installer_file)
     if install_path is not None:
         if not os.path.isdir (install_path):
             os.makedirs (install_path)
     else:
         install_path = '.'
     content = extract(installer_file, install_path)
     if not content:
         return False
     cwd = install_path
     for p in content:
         if os.path.isdir(p):
             cwd = p
             break
     return self.install_source(cwd)
Example #47
File: gui.py Project: pearu/iocbio
 def get_and_install_source(self, source):
     source_file = os.path.abspath(os.path.basename (source))
     if not os.path.isfile (source_file):
         print 'Downloading', source, '..',
         source_file = download (source)
         if source_file is None:
             print 'Download FAILED'
             return False
         print 'DONE'
     source_path = self.get_source_path(source_file)
     if source_path is not None:
         if not os.path.isdir (source_path):
             os.makedirs (source_path)
     else:
         source_path = '.'
     content = extract(source_file, source_path)
     if not content:
         return False
     cwd = source_path
     for p in content:
         if os.path.isdir(p):
             cwd = p
             break
     return self.install_source(os.path.abspath(cwd))
Example #48
def install():
    fetch('http://piumarta.com/software/peg/peg-%(peg)s.tar.gz')
    extract('peg-%(peg)s.tar.gz')
    make('peg-%(peg)s', 'CC=clang')
    make('peg-%(peg)s', 'PREFIX=%s install' % env.prefix)
Example #49
    def execute(self, args=None):
        validTemplateNames = ['helloworld', 'helloworldwebapp', 'pale'] + customStarterApps
        if not args:
            print self.shorthelp
            print 'available app templates:'
            print 'helloworld           -- simple helloworld app'
            print 'helloworldwebapp     -- simple helloworld app using webapp fmk'
            print 'xmppsendandreply     -- simple xmpp (instant message) send and reply'
            print 'emailreceive         -- simple e-mail receive example'
            print 'emailsendui          -- simple e-mail send example'
            print 'deferredemail        -- simple deferred lib queued e-mail send example'
            print 'starter_pale         -- a basic project layout with buckets for most things you could want and an import fix built in'
        else:
            templateName = args[0].lower()

            if templateName not in validTemplateNames:
                print 'Unknown app name %s' % args[0]
                return
            if templateName in customStarterApps:
                tarballurl = 'http://github.com/mpstx/appengine_py_%s/tarball/master' % templateName
                tmpPath = join(join(alePath('tmp'), templateName + '.tar.gz'))
                download(tarballurl, '%s.tar.gz' % templateName)
                logging.info("Extracting %s here" % templateName)
                os.system('tar xzf %s --strip 1 -C .' % tmpPath)
            elif templateName == 'helloworld':
                logging.info('creating ./helloworld.py')
                FILE = open('./helloworld.py', 'w')
                FILE.write("""
print 'Content-Type: text/plain'
print ''
print 'Hello, world!  This is a bare bones app engine application'
""")
                FILE.close()

                logging.info('creating ./app.yaml')
                FILE = open('./app.yaml', 'w')
                FILE.write("""
application: helloworld
version: 1
runtime: python
api_version: 1

handlers:
- url: /.*
  script: helloworld.py        
            """)
                FILE.close()
            elif templateName == 'helloworldwebapp':
                logging.info('creating ./helloworld.py')
                FILE = open('./helloworld.py', 'w')
                FILE.write("""
from google.appengine.ext import webapp
from google.appengine.ext.webapp.util import run_wsgi_app

class MainPage(webapp.RequestHandler):
    def get(self):
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.out.write('Hello, webapp World!')

application = webapp.WSGIApplication(
                                     [('/', MainPage)],
                                     debug=True)

def main():
    run_wsgi_app(application)

if __name__ == "__main__":
    main()
""")
                FILE.close()

                logging.info('creating ./app.yaml')
                FILE = open('./app.yaml', 'w')
                FILE.write("""
application: helloworldwebapp
version: 1
runtime: python
api_version: 1

handlers:
- url: /.*
  script: helloworld.py        
""")
                FILE.close()
            else:
                pkgPath = join(join(alePath('recipes_installed'), 'createapp'), 'pkgs')
                templateZipPath = join(pkgPath, '%s.zip' % templateName)

                if os.path.exists(templateZipPath):
                    extract(templateZipPath, '.')
                    gitignore('tmp')
                else:
                    logging.error('Could not find template: %s' % templateName)
                    return

            return 0
Example #50
def install():
    fetch("http://www.fastcgi.com/dist/fcgi-%(fcgi)s.tar.gz")
    extract("fcgi-%(fcgi)s.tar.gz")
    configure("fcgi-%(fcgi)s", ["--prefix=%s" % env.prefix])
    make("fcgi-%(fcgi)s")
    make("fcgi-%(fcgi)s", "install")
import sys
import time
import math
import utils
from pyspark.context import SparkContext

if len(sys.argv) > 1:
    hdfs_file_path = "/user/lsde02/data/%s/*.gz" % sys.argv[1]
else:
    hdfs_file_path = "/user/lsde02/data/1901/*.gz"
hdfs_results_path = "/user/lsde02/results/"
start_time = time.strftime("%Y-%m-%d-%H-%M-%S")

sc = SparkContext()
context = sc.textFile(hdfs_file_path)
stations = context.flatMap(lambda x: [utils.extract(record) for record in x.splitlines()])
stations = stations.filter(lambda x: 'longitude' in x[1] and 'latitude' in x[1])
stations.persist()

# Do computations on month level. The combiner keeps a (sum, count) pair per
# measurement plus summed sine/cosine components for the circular wind direction.
month_data = stations.map(lambda x: ((x[0][0], x[0][1], x[0][3]),
                                     (x[1]['temp'], x[1]['wind-speed'], x[1]['sky-condition'],
                                      x[1]['visibility'], x[1]['wind-direction'])))
month_data = month_data.combineByKey(
    lambda v: (v[0], 1, v[1], 1, v[2], 1, v[3], 1,
               math.sin(v[4] * math.pi / 180.), math.cos(v[4] * math.pi / 180.)),
    lambda acc, v: (acc[0] + v[0], acc[1] + 1, acc[2] + v[1], acc[3] + 1,
                    acc[4] + v[2], acc[5] + 1, acc[6] + v[3], acc[7] + 1,
                    acc[8] + math.sin(v[4] * math.pi / 180.),
                    acc[9] + math.cos(v[4] * math.pi / 180.)),
    lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3],
                  a[4] + b[4], a[5] + b[5], a[6] + b[6], a[7] + b[7],
                  a[8] + b[8], a[9] + b[9]))
# Turn sums into means; recover the mean wind direction from the summed
# sine/cosine components via atan2.
month_data = month_data.map(
    lambda kv: (kv[0], (kv[1][0] / kv[1][1], kv[1][2] / kv[1][3],
                        kv[1][4] / kv[1][5], kv[1][6] / kv[1][7],
                        math.atan2(kv[1][8], kv[1][9]))))
month_data = month_data.coalesce(1, True)
month_data.saveAsTextFile("%s%s-%s" % (hdfs_results_path, start_time, 'all'))
Example #52
File: shell.py Project: mattorb/Ale
 def install(self, args=None):
     dlFile = download('http://ipython.scipy.org/dist/0.10/ipython-0.10.tar.gz', 'ipython-0.10.tar.gz')
     extract(dlFile, extractPath)
Example #53
 def _be(self, *keys):
     """
     Private conviniance wrapper around extract. 
     Hash defaults to self._hash
     """
     return extract(self._hash, *keys)
Example #54
    def __init__(self, bug, hash):
        """
        Initialize attachments

        :arg hash: Dictionary of attachment details
        :arg bug: Instance of :class:`~bz_xmlrpc.classes.Bug` object

        :return: Instance of :class:`Attachment`
        .. note::
            No need to use this directly. 
            Use :meth:`~bz_xmlrpc.classes.Bug.get_attachments()`

        """
        self._hash = hash
        self.id = extract(hash, "id", "attach_id")
        self.content_type = extract(hash, "content_type", "mimetype")
        self.creation_time = to_datetime(extract(hash, "creation_time", "creation_ts"))
        self.attacher = extract(hash, "attacher", "submitter_id")
        self.description = extract(hash, "description")
        self.file_name = extract(hash, "file_name", "filename")
        self.bug = bug
        self.is_private = bool(extract(hash, "is_private", "isprivate"))
        self.is_obsolete = bool(extract(hash, "is_obsolete", "isobsolete"))
        self.is_patch = bool(extract(hash, "is_patch", "ispatch"))
        self.is_url = bool(extract(hash, "is_url", "isurl"))
        self.last_change_time = to_datetime(extract(hash, "last_change_time", "modification_time"))

        if self.id and self.bug:
            self.fetch_url = bug.bz.url.replace("xmlrpc.cgi", "attachment.cgi?id=%s" % self.id)
Example #55
    def install(self, args=None):
        dlFile = download('http://github.com/ishikawa/modipyd/zipball/release-1-1-rc1', 'ishikawa-modipyd.zip')
        extract(dlFile, extractPath)

        os.system('chmod +x %s' % join(join(join(extractPath, 'ishikawa-modipyd-1516eeb'), 'bin'), 'modipyd'))
        os.system('chmod +x %s' % join(join(join(extractPath, 'ishikawa-modipyd-1516eeb'), 'bin'), 'pyautotest'))