def process_source(src):
    """
    Parse the source according to the given schema and create the list
    of dicts with records of Authorities.
    """
    schema = define_schema(AuthorityProfile)
    if VERBOSE:
        print "Extracting the data from the source CSV file."
    try:
        authorities = EX.extract(src, schema)
        if VERBOSE:
            print "Extraction completed successfully."
    except Exception as e:
        print e
        return False
    if VERBOSE:
        print "Processing data from the obtained dataset."
    try:
        authorities = process_extracted_data(authorities)
    except Exception as e:
        print e
        return False
    if VERBOSE:
        print "Processing complete."
    return True
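# Minimal usage sketch for process_source(), assuming the module-level
# names used above (VERBOSE, EX, define_schema, AuthorityProfile,
# process_extracted_data) are available; the CSV file name below is a
# hypothetical placeholder, not taken from the source.
if __name__ == '__main__':
    if not process_source('authorities.csv'):  # hypothetical input file
        print "Processing failed; see the error printed above."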
class DOMHTMLPersonGenresParser(DOMParserBase): """Parser for the "by genre" and "by keywords" pages of a given person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: gparser = DOMHTMLPersonGenresParser() result = gparser.parse(bygenre_html_string) """ kind = 'genres' _containsObjects = True extractors = [ Extractor(label='genres', group="//b/a[@name]/following-sibling::a[1]", group_key="./text()", group_key_normalize=lambda x: x.lower(), path="../../following-sibling::ol[1]/li//a[1]", attrs=Attribute(key=None, multi=True, path={ 'link': "./@href", 'title': "./text()", 'info': "./following-sibling::text()" }, postprocess=lambda x: \ build_movie(x.get('title') + \ x.get('info').split('[')[0], analyze_imdbid(x.get('link'))))) ] def postprocess_data(self, data): if len(data) == 0: return {} return {self.kind: data}
class DOMHTMLSearchMovieParser(DOMParserBase): """Parse the html page that the IMDb web server shows when the "new search system" is used, for movies.""" _BaseParser = DOMBasicMovieParser _notDirectHitTitle = '<title>imdb title' _titleBuilder = lambda self, x: build_title(x) _linkPrefix = '/title/tt' _attrs = [Attribute(key='data', multi=True, path={ 'link': "./a[1]/@href", 'info': ".//text()", #'akas': ".//div[@class='_imdbpyAKA']//text()" 'akas': ".//p[@class='find-aka']//text()" }, postprocess=lambda x: ( analyze_imdbid(x.get('link') or u''), custom_analyze_title(x.get('info') or u''), x.get('akas') ))] extractors = [Extractor(label='search', path="//td[3]/a[starts-with(@href, '/title/tt')]/..", attrs=_attrs)] def _init(self): self.url = u'' def _reset(self): self.url = u'' def preprocess_string(self, html_string): if self._notDirectHitTitle in html_string[:1024].lower(): if self._linkPrefix == '/title/tt': # Only for movies. html_string = html_string.replace('(TV mini-series)', '(mini)') html_string = html_string.replace('<p class="find-aka">', '<p class="find-aka">::') #html_string = _reAKAStitles.sub( # r'<div class="_imdbpyAKA">\1::</div>\2', html_string) return html_string # Direct hit! dbme = self._BaseParser(useModule=self._useModule) res = dbme.parse(html_string, url=self.url) if not res: return u'' res = res['data'] if not (res and res[0]): return u'' link = '%s%s' % (self._linkPrefix, res[0][0]) # # Tries to cope with companies for which links to pro.imdb.com # # are missing. # link = self.url.replace(imdbURL_base[:-1], '') title = self._titleBuilder(res[0][1]) if not (link and title): return u'' link = link.replace('http://pro.imdb.com', '') new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link, title) return new_html def postprocess_data(self, data): if not data.has_key('data'): data['data'] = [] results = getattr(self, 'results', None) if results is not None: data['data'][:] = data['data'][:results] # Horrible hack to support AKAs. if data and data['data'] and len(data['data'][0]) == 3 and \ isinstance(data['data'][0], tuple): for idx, datum in enumerate(data['data']): if datum[2] is not None: akas = filter(None, datum[2].split('::')) if self._linkPrefix == '/title/tt': akas = [a.replace('" - ', '::').rstrip() for a in akas] akas = [a.replace('aka "', '', 1).lstrip() for a in akas] datum[1]['akas'] = akas data['data'][idx] = (datum[0], datum[1]) else: data['data'][idx] = (datum[0], datum[1]) return data def add_refs(self, data): return data
class DOMHTMLBioParser(DOMParserBase): """Parser for the "biography" page of a given person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: bioparser = DOMHTMLBioParser() result = bioparser.parse(biography_html_string) """ _defGetRefs = True _birth_attrs = [Attribute(key='birth date', path={ 'day': "./a[starts-with(@href, " \ "'/OnThisDay?')]/text()", 'year': "./a[starts-with(@href, " \ "'/BornInYear?')]/text()" }, postprocess=lambda x: build_date(x)), Attribute(key='birth notes', path="./a[starts-with(@href, '/BornWhere?')]/text()")] _death_attrs = [Attribute(key='death date', path={ 'day': "./a[starts-with(@href, " \ "'/OnThisDay?')]/text()", 'year': "./a[starts-with(@href, " \ "'/DiedInYear?')]/text()" }, postprocess=lambda x: build_date(x)), Attribute(key='death notes', path="./text()", # TODO: check if this slicing is always correct postprocess=lambda x: u''.join(x).strip()[2:])] extractors = [ Extractor(label='birth info', path="//div[h5='Date of Birth']", attrs=_birth_attrs), Extractor(label='death info', path="//div[h5='Date of Death']", attrs=_death_attrs), Extractor(label='nick names', path="//div[h5='Nickname']", attrs=Attribute(key='nick names', path="./text()", joiner='|', postprocess=lambda x: [n.strip().replace(' (', '::(', 1) for n in x.split('|') if n.strip()])), Extractor(label='birth name', path="//div[h5='Birth Name']", attrs=Attribute(key='birth name', path="./text()", postprocess=lambda x: canonicalName(x.strip()))), Extractor(label='height', path="//div[h5='Height']", attrs=Attribute(key='height', path="./text()", postprocess=lambda x: x.strip())), Extractor(label='mini biography', path="//div[h5='Mini Biography']", attrs=Attribute(key='mini biography', multi=True, path={ 'bio': "./p//text()", 'by': "./b/following-sibling::a/text()" }, postprocess=lambda x: "%s::%s" % \ (x.get('bio').strip(), (x.get('by') or u'').strip() or u'Anonymous'))), Extractor(label='spouse', path="//div[h5='Spouse']/table/tr", attrs=Attribute(key='spouse', multi=True, path={ 'name': "./td[1]//text()", 'info': "./td[2]//text()" }, postprocess=lambda x: "%s::%s" % \ (x.get('name').strip(), x.get('info').strip()))), Extractor(label='trade mark', path="//div[h5='Trade Mark']/p", attrs=Attribute(key='trade mark', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='trivia', path="//div[h5='Trivia']/p", attrs=Attribute(key='trivia', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='quotes', path="//div[h5='Personal Quotes']/p", attrs=Attribute(key='quotes', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='salary', path="//div[h5='Salary']/table/tr", attrs=Attribute(key='salary history', multi=True, path={ 'title': "./td[1]//text()", 'info': "./td[2]/text()", }, postprocess=lambda x: "%s::%s" % \ (x.get('title').strip(), x.get('info').strip()))), Extractor(label='where now', path="//div[h5='Where Are They Now']/p", attrs=Attribute(key='where now', multi=True, path=".//text()", postprocess=lambda x: x.strip())), ] preprocessors = [(re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'), (re.compile('(</table>\n</div>\s+)</div>', re.I + re.DOTALL), r'\1'), (re.compile('(<div id="tn15bot">)'), r'</div>\1'), (re.compile('\.<br><br>([^\s])', re.I), r'. \1')]
class DOMHTMLMaindetailsParser(DOMParserBase): """Parser for the "categorized" (maindetails) page of a given person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: cparser = DOMHTMLMaindetailsParser() result = cparser.parse(categorized_html_string) """ _containsObjects = True _birth_attrs = [Attribute(key='birth date', path={ 'day': "./a[starts-with(@href, " \ "'/OnThisDay?')]/text()", 'year': "./a[starts-with(@href, " \ "'/BornInYear?')]/text()" }, postprocess=lambda x: build_date(x)), Attribute(key='birth notes', path="./a[starts-with(@href, '/BornWhere?')]/text()")] _death_attrs = [Attribute(key='death date', path={ 'day': "./a[starts-with(@href, " \ "'/OnThisDay?')]/text()", 'year': "./a[starts-with(@href, " \ "'/DiedInYear?')]/text()" }, postprocess=lambda x: build_date(x)), Attribute(key='death notes', path="./text()", # TODO: check if this slicing is always correct postprocess=lambda x: x.strip()[2:])] _film_attrs = [ Attribute(key=None, multi=True, path={ 'link': "./a[1]/@href", 'title': ".//text()", 'status': "./i/a//text()", 'roleID': "./div[@class='_imdbpyrole']/@roleid" }, postprocess=lambda x: build_movie( x.get('title') or u'', movieID=analyze_imdbid(x.get('link') or u''), roleID=(x.get('roleID') or u'').split('/'), status=x.get('status') or None)) ] extractors = [ Extractor(label='page title', path="//title", attrs=Attribute( key='name', path="./text()", postprocess=lambda x: analyze_name(x, canonical=1))), Extractor(label='birth info', path="//div[h5='Date of Birth:']", attrs=_birth_attrs), Extractor(label='death info', path="//div[h5='Date of Death:']", attrs=_death_attrs), Extractor(label='headshot', path="//a[@name='headshot']", attrs=Attribute(key='headshot', path="./img/@src")), Extractor(label='akas', path="//div[h5='Alternate Names:']", attrs=Attribute( key='akas', path="./text()", postprocess=lambda x: x.strip().split(' | '))), Extractor(label='filmography', group="//div[@class='filmo'][h5]", group_key="./h5/a[@name]/text()", group_key_normalize=lambda x: x.lower()[:-1], path="./ol/li", attrs=_film_attrs) ] preprocessors = [ # XXX: check that this doesn't cut "status" or other info... (re.compile(r'<br>(\.\.\.| ?).+?</li>', re.I | re.M | re.S), '</li>'), (_reRoles, _manageRoles) ]
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser): """Parser for the "biography" page of a given character. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: bparser = DOMHTMLCharacterMaindetailsParser() result = bparser.parse(character_biography_html_string) """ _containsObjects = True _film_attrs = [Attribute(key=None, multi=True, path={ 'link': "./a[1]/@href", 'title': ".//text()", 'status': "./i/a//text()", 'roleID': "./a/@href" }, postprocess=lambda x: build_movie(x.get('title') or u'', movieID=analyze_imdbid(x.get('link') or u''), roleID=_personIDs.findall(x.get('roleID') or u''), status=x.get('status') or None, _parsingCharacter=True))] extractors = [ Extractor(label='title', path="//title", attrs=Attribute(key='name', path="./text()", postprocess=lambda x: \ x.replace(' (Character)', '').strip())), Extractor(label='headshot', path="//a[@name='headshot']", attrs=Attribute(key='headshot', path="./img/@src")), Extractor(label='akas', path="//div[h5='Alternate Names:']", attrs=Attribute(key='akas', path="./text()", postprocess=lambda x: x.strip().split(' / '))), Extractor(label='filmography', path="//div[@class='filmo'][not(h5)]/ol/li", attrs=_film_attrs), Extractor(label='filmography sections', group="//div[@class='filmo'][h5]", group_key="./h5/a/text()", group_key_normalize=lambda x: x.lower()[:-1], path="./ol/li", attrs=_film_attrs), ] preprocessors = [ # Check that this doesn't cut "status"... (re.compile(r'<br>(\.\.\.| ).+?</li>', re.I | re.M), '</li>')]
class DOMHTMLResumeParser(DOMParserBase): """Parser for the "resume" page of a given person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: resumeparser = DOMHTMLResumeParser() result = resumeparser.parse(resume_html_string) """ _defGetRefs = True extractors = [ Extractor( label='info', group="//div[@class='section_box']", group_key="./h3/text()", group_key_normalize=lambda x: x.lower().replace(' ', '_'), path="./ul[@class='resume_section_multi_list']//li", attrs=Attribute( key=None, multi=True, path={ 'title': ".//b//text()", 'desc': ".//text()", }, postprocess=lambda x: (x.get('title'), x.get('desc').strip().replace('\n', ' ')))), Extractor(label='other_info', group="//div[@class='section_box']", group_key="./h3/text()", group_key_normalize=lambda x: x.lower().replace(' ', '_'), path="./ul[@class='_imdbpy']//li", attrs=Attribute( key=None, multi=True, path=".//text()", postprocess=lambda x: x.strip().replace('\n', ' '))), Extractor( label='credits', group="//div[@class='section_box']", group_key="./h3/text()", group_key_normalize=lambda x: x.lower().replace(' ', '_'), path="./table[@class='credits']//tr", attrs=Attribute( key=None, multi=True, path={ '0': ".//td[1]//text()", '1': ".//td[2]//text()", '2': ".//td[3]//text()", }, postprocess=lambda x: [x.get( '0'), x.get('1'), x.get('2')])), Extractor(label='mini_info', path="//div[@class='center']", attrs=Attribute( key='mini_info', path=".//text()", postprocess=lambda x: x.strip().replace('\n', ' '))), Extractor(label='name', path="//div[@class='center']/h1[@id='preview_user_name']", attrs=Attribute( key='name', path=".//text()", postprocess=lambda x: x.strip().replace('\n', ' '))), Extractor(label='resume_bio', path="//div[@id='resume_rendered_html']//p", attrs=Attribute(key='resume_bio', multi=True, path=".//text()")), ] preprocessors = [ (re.compile('(<ul>)', re.I), r'<ul class="_imdbpy">\1'), ] def postprocess_data(self, data): for key in data.keys(): if data[key] == '': del data[key] if key in ('mini_info', 'name', 'resume_bio'): if key == 'resume_bio': data[key] = "".join(data[key]).strip() continue if len(data[key][0]) == 3: for item in data[key]: item[:] = [x for x in item if not x is None] continue if len(data[key][0]) == 2: new_key = {} for item in data[key]: if item[0] is None: continue if ':' in item[0]: if item[1].replace(item[0], '')[1:].strip() == '': continue new_key[item[0].strip().replace(':', '')] = item[1].replace( item[0], '')[1:].strip() else: new_key[item[0]] = item[1] data[key] = new_key new_data = {'resume': data} return new_data
class Prediction: def __init__(self): rospy.init_node("onlinePrediction", anonymous=True) self.__controlSignal = False self.__controlState = 0 self.__isAngleValid = False self.__isImuValid = False self.__extract = Extractor() self.__data = [] self.__isPredictionValid = False self.__model = Model() self.__collectData = [] self.__isAngleInit = False self.__initAngleData = [] self.__angleOffset = 0 self.__pubFuture = rospy.Publisher("predicted_trajectory", FutureTrajectory, queue_size=4) self.__pubLocation = rospy.Publisher("nowLocation", Point, queue_size=4) rospy.Subscriber("robot_odm", Pose2D, self.odmCallback, queue_size=4) rospy.Subscriber("openpose_ros/human_depth_list", HumanDepthList, self.humanListCallback, queue_size=50) rospy.Subscriber("/robot/controlSignal", Bool, self.controlSignalCallback) rospy.Subscriber("/imu", Imu, self.imuCallback, queue_size=3) rospy.Subscriber("/imu_angle", Vector3, self.angleCallback) def odmCallback(self, odm): self.__odm = odm def controlSignalCallback(self, controlSignal): self.__controlSignal = controlSignal.data def humanListCallback(self, humanDepthList): if self.__isImuValid is False or self.__isAngleValid is False: return if self.__controlState == 0: # 控制状态为0表明还未开始记录 if self.__controlSignal is False: return else: print "start recording!" self.__controlState = 1 if self.__controlState == 1: if self.__controlSignal is True: # 记录数据 curData = self.__extract.extractHumanPose( humanDepthList, self.__odm) curData.extend([ self.__angleData.x, self.__angleData.y, self.__angleData.z, self.__imuData.linear_acceleration.x, self.__imuData.linear_acceleration.y, self.__imuData.linear_acceleration.z, self.__imuData.angular_velocity.x, self.__imuData.angular_velocity.y, self.__imuData.angular_velocity.z ]) self.__data.append(curData) nowLocation = Point() nowLocation.x = curData[0] nowLocation.y = curData[1] nowLocation.z = 0 self.__pubLocation.publish(nowLocation) if self.__isAngleInit is False: self.__initAngleData.append(self.__angleData.z) if len(self.__initAngleData) >= 8: self.__isAngleInit = True self.__angleOffset = sum(self.__initAngleData) / len( self.__initAngleData) self.__initAngleData = [] if len(self.__data) >= 11: self.__isPredictionValid = True if len(self.__data) > 11: del self.__data[0] if self.__isPredictionValid is True: # print self.__odm npTrajectory = np.array(self.__data) cur = time.time() npFuture = self.__model.predictFuture( npTrajectory[:, srcIndex]) Weights = np.matmul(T_inv, npFuture) # plotLocationData(npTrajectory.transpose(), npFuture.transpose()) self.__collectData.append([ npTrajectory[:, srcIndex].transpose(), npFuture.transpose() ]) futureTrajectory = FutureTrajectory() for i in range(0, 8): point = Point() point.x = npFuture[i][0] point.y = npFuture[i][1] point.z = 0 futureTrajectory.locations.append(point) for i in range(0, 3): for j in range(0, 2): futureTrajectory.weights.append( Float64(Weights[i][j] / 1000)) futureTrajectory.weights.append( Float64(self.__angleData.z - self.__angleOffset)) print((self.__angleData.z - self.__angleOffset) / 180 * 3.14159) now = time.time() # print( now - cur ) self.__pubFuture.publish(futureTrajectory) else: self.__controlState = 0 # 将数据存成文件并且清空数据 nowTime = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time())) fileName = nowTime + ' Line3' + '.npy' # plotData(self.__data) np.save(fileName, self.__data) self.__data = [] self.__extract = Extractor() print "save as " + fileName self.__isAngleInit = False ''' for i in range(len(self.__collectData)): if i % 5 == 0: 
plotLocationData(self.__collectData[i][0], self.__collectData[i][1]) ''' def imuCallback(self, imuData): self.__isImuValid = True self.__imuData = imuData def angleCallback(self, angleData): self.__isAngleValid = True self.__angleData = angleData def run(self): rospy.spin()
class CollectData: def __init__(self): rospy.init_node("CollectData", anonymous=True) rospy.Subscriber("robot_odm", Pose2D, self.odmCallback) rospy.Subscriber("openpose_ros/human_depth_list", HumanDepthList, self.humanListCallback) rospy.Subscriber("/robot/controlSignal", Bool, self.controlSignalCallback) rospy.Subscriber("/imu", Imu, self.imuCallback) rospy.Subscriber("/imu_angle", Vector3, self.angleCallback) self.__controlSignal = False self.__controlState = 0 self.__isAngleValid = False self.__isImuValid = False self.__extract = Extractor() self.__data = [] def odmCallback(self, odm): self.__odm = odm def controlSignalCallback(self, controlSignal): if controlSignal.data is True: time.sleep(12) self.__controlSignal = controlSignal.data def humanListCallback(self, humanDepthList): if self.__isImuValid is False or self.__isAngleValid is False: return if self.__controlState == 0: # 控制状态为0表明还未开始记录 if self.__controlSignal is False: return else: print "start recording!" self.__controlState = 1 if self.__controlState == 1: if self.__controlSignal is True: # 记录数据 curData = self.__extract.extractHumanPose( humanDepthList, self.__odm) curData.extend([ self.__angleData.x, self.__angleData.y, self.__angleData.z, self.__imuData.linear_acceleration.x, self.__imuData.linear_acceleration.y, self.__imuData.linear_acceleration.z, self.__imuData.angular_velocity.x, self.__imuData.angular_velocity.y, self.__imuData.angular_velocity.z, time.time(), self.__odm.x, self.__odm.y, self.__odm.theta ]) self.__data.append(curData) else: self.__controlState = 0 # 将数据存成文件并且清空数据 nowTime = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time())) fileName = nowTime + ' Turn1' + '.npy' result = plotData(self.__data) if result is True: np.save(fileName, self.__data) print "save as " + fileName else: print "wrong, reject" self.__data = [] self.__extract = Extractor() def imuCallback(self, imuData): self.__isImuValid = True self.__imuData = imuData def angleCallback(self, angleData): self.__isAngleValid = True self.__angleData = angleData def run(self): rospy.spin()
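# Minimal launch sketch for the CollectData node above, assuming a ROS
# master is running and the class is defined in (or imported into) the
# executable script; run() simply blocks in rospy.spin() until shutdown.
if __name__ == '__main__':
    collector = CollectData()  # subscribers are registered in __init__
    collector.run()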
class Source(models.Model): """ This could be a single site or part of a site which contains wanted content """ url = models.CharField(max_length=256) name = models.CharField(max_length=256, blank=True, null=True) # Links section link_xpath = models.CharField(max_length=255) expand_rules = models.TextField(blank=True, null=True) crawl_depth = models.PositiveIntegerField(default=1) # Content section content_xpath = models.CharField(max_length=255, blank=True, null=True) content_type = models.ForeignKey('ContentType', blank=True, null=True) meta_xpath = models.TextField(default='', blank=True) extra_xpath = models.TextField(default='', blank=True) refine_rules = models.TextField(default='', blank=True) active = models.BooleanField(default=True) download_image = models.BooleanField(default=True) # Extra settings black_words = models.ForeignKey('WordSet', blank=True, null=True) proxy = models.ForeignKey('ProxyServer', blank=True, null=True) user_agent = models.ForeignKey('UserAgent', blank=True, null=True) def __unicode__(self): return '%s' % (self.name or self.url) def get_extractor(self): return self._extractor def crawl(self, download=True): logger.info('') logger.info('Start crawling %s (%s)' % (self.name, self.url)) # Custom definitions metapath = eval(self.meta_xpath) if self.meta_xpath else None expand_rules = self.expand_rules.split('\n') \ if self.expand_rules else None refine_rules = [item.strip() for item in self.refine_rules.split('\n') if item.strip()] extrapath = [item.strip() for item in self.extra_xpath.split('\n') if item.strip()] proxy = self.proxy.get_dict() if self.proxy else None logger.info('Use proxy server: %s' % self.proxy) ua = self.user_agent.value if self.user_agent else None logger.info('Use user agent: %s' % self.user_agent) # Initialize extractor self._extractor = Extractor(self.url, settings.CRAWL_ROOT, proxies=proxy, user_agent=ua) make_root = False if self.link_xpath.startswith('/+'): make_root = True self.link_xpath = self.link_xpath[2:] all_links = self._extractor.extract_links( xpath=self.link_xpath, expand_rules=expand_rules, depth=self.crawl_depth, make_root=make_root) logger.info('%d link(s) found' % len(all_links)) # Just dry running or real download if download: blacklist = [] local_content = [] if self.black_words: blacklist = self.black_words.words.split('\n') for link in all_links: try: link_url = link['url'] if LocalContent.objects.filter(url=link_url).count(): logger.info('Bypass %s' % link_url) continue logger.info('Download %s' % link_url) location = datetime.now().strftime('%Y/%m/%d') location = os.path.join(settings.CRAWL_ROOT, location) sub_extr = Extractor(link_url, location, proxy) if self.content_type: base_meta = {'type': self.content_type.name} else: base_meta = None local_path = sub_extr.extract_content( self.content_xpath, with_image=self.download_image, metapath=metapath, extrapath=extrapath, custom_rules=refine_rules, blacklist=blacklist, metadata=base_meta) content = LocalContent(url=link_url, source=self, local_path=local_path) content.save() local_content.append(content) except: logger.exception('Error when extracting %s' % link['url']) paths = [lc.local_path for lc in local_content] return paths else: return all_links
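# Hedged usage sketch for Source.crawl(), e.g. from a Django shell or a
# periodic task; the import path and the primary key are hypothetical.
from crawler.models import Source  # assumed app/module path

source = Source.objects.get(pk=1)       # hypothetical source record
paths = source.crawl(download=True)     # download and store matched pages
print('Saved %d page(s)' % len(paths))
links = source.crawl(download=False)    # dry run: only collect links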
class DIRT: def __init__(self): self.IO = IOManager() self.exactor = Extractor('illegal.txt') def _construct_database(self, corpus_path): """Construct the database based on the corpus. Args: corpus_path: str - The path of corpus. Returns: Database """ database = Database() # construct the database for words, poss in self.IO.read_sentences(corpus_path): triples = self.exactor.extract(words, poss) for triple in triples: database.insert(triple) return database def run(self, corpus_path, test_path, minfreq): self._database = self._construct_database(corpus_path) before_unique, before_total = self._stas(self._database) self._database.apply_minfreq(minfreq) after_unique, after_total = self._stas(self._database) sim = Similarity(self._database) test_phrases = self.IO.read_phrases(test_path) with open('trace.txt', 'w', encoding='utf8') as f: # Write the head line. args = [before_unique, after_unique, before_total, after_total] f.write('\n') self._write_head(f, args) for phrase in test_phrases: most_similar = self._find_k_similar(phrase, sim, 5) self._write_result(f, phrase, most_similar) def _stas(self, database): """Return the statistic of the database. """ return len(database), database.path_number() def _write_head(self, f, args): """Write the head line for output. """ s = 'Found {a} distinct paths, {b} after minfreq filtering.\n' s = s.format(a=args[0], b=args[1]) f.write(s) s = 'Found {a} path instances, {b} after minfreq filtering.\n' s = s.format(a=args[2], b=args[3]) f.write(s) f.write('\n') def _find_k_similar(self, phrase, sim, k=5): """Find the k most similar paths. If phrase does not in the database, reutrn None Args: phrase: str sim: Similarity k: int Returns: a list of tuple with size of k. Each tuple contains the path and corrsponding score. """ if phrase not in self._database: return None reval = [(path, sim.PathSim(phrase, path)) for path in self._database] reval.sort(key=lambda x: x[-1], reverse=True) # To deal with tie cases. value = reval[k - 1][-1] reval = [v for v in reval if v[-1] >= value] return reval def _write_result(self, f, phrase, result): """Write thr result into files. Args: f: file phrase: str result: list(tuple(path, score)) """ s = 'MOST SIMILAR RULES FOR: {a}\n'.format(a=phrase) n = 'This phrase is not in the triple database.\n' t = '{a}. {b}\t{c}\n' f.write(s) if result is None: f.write(n) else: for i, item in enumerate(result): path = str(item[0]) score = str(item[-1]) tt = t.format(a=str(i + 1), b=path, c=score) f.write(tt) f.write('\n')
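# Hedged usage sketch for the DIRT pipeline above: the corpus/test paths
# and the minfreq threshold are hypothetical placeholders, but the call
# signature matches DIRT.run(); results are written to trace.txt by run().
if __name__ == '__main__':
    dirt = DIRT()
    dirt.run('corpus.txt', 'test_phrases.txt', minfreq=3)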
# Initialize Corpuses
corpus = {}
for year in years:
    corpus[year] = Corpus(year=year - 2015)

# Build Corpuses: load cleaned articles, build phrasers, dictionary, and BOWs
for year in years:
    print("Corpus " + str(year) + ":")
    corpus[year].build_corpus()
    print("Corpus " + str(year) + " Done\n")

# Extract keywords from each article using tf-idf
for year in years:
    print("Corpus " + str(year) + ":")
    corpus[year].build_tfidf()
    corpus[year].extractor = Extractor(corpus[year])
    corpus[year].extractor.extract(k=num_keywords)
    print("Corpus " + str(year) + " Done\n")

# Build LDA model, cluster articles into issues
for year in years:
    print("Corpus " + str(year) + ":")
    corpus[year].build_lda(num_topics=num_issues)
    corpus[year].issue_model = IssueModel(corpus=corpus[year], model=corpus[year].lda)
    corpus[year].issue_model.build_issues()
    print("Corpus " + str(year) + " Done\n")

# Init Issues (for Issue Tracking)
issues = []
for year in years:
def crawl(self, download=True): logger.info('') logger.info('Start crawling %s (%s)' % (self.name, self.url)) # Custom definitions metapath = eval(self.meta_xpath) rules = [item.strip() for item in self.refine_rules.split('\n') if item.strip()] extrapath = [item.strip() for item in self.extra_xpath.split('\n') if item.strip()] proxy = {} if self.proxy: logger.info('Use proxy server: %s' % self.proxy.address) proxy = self.proxy.get_dict() else: proxy = None if self.user_agent: logger.info('Use user agent: %s' % self.user_agent.name) ua = self.user_agent.value else: ua = None extractor = Extractor(self.url, settings.CRAWL_ROOT, proxies=proxy, user_agent=ua) all_links = extractor.extract_links( xpath=self.link_xpath, expand_rules=self.expand_rules.split('\n'), depth=self.crawl_depth) logger.info('%d link(s) found' % len(all_links)) if download: blacklist = [] if self.black_words: blacklist = self.black_words.words.split('\n') for link in all_links: try: link_url = link['url'] if LocalContent.objects.filter(url=link_url).count(): logger.info('Bypass %s' % link_url) continue logger.info('Download %s' % link_url) location = datetime.now().strftime('%Y/%m/%d') location = os.path.join(settings.CRAWL_ROOT, location) sub_extr = Extractor(link_url, location, proxy) if self.content_type: base_meta = {'type': self.content_type.name} else: base_meta = None local_path = sub_extr.extract_content( self.content_xpath, with_image=self.download_image, metapath=metapath, extrapath=extrapath, custom_rules=rules, blacklist=blacklist, metadata=base_meta) content = LocalContent(url=link_url, source=self, local_path=local_path) content.save() except: logger.exception('Error when extracting %s' % link['url']) else: return all_links
class DOMHTMLSearchMovieParser(DOMParserBase): """Parse the html page that the IMDb web server shows when the "new search system" is used, for movies.""" _BaseParser = DOMBasicMovieParser _notDirectHitTitle = '<title>imdb title' _titleBuilder = lambda self, x: build_title(x, canonical=True) _linkPrefix = '/title/tt' _attrs = [ Attribute(key='data', multi=True, path={ 'link': "./a[1]/@href", 'info': ".//text()" }, postprocess=lambda x: (analyze_imdbid(x.get('link') or u''), analyze_title(x.get('info') or u'', canonical=1))) ] extractors = [ Extractor(label='search', path="//td[3]/a[starts-with(@href, '/title/tt')]/..", attrs=_attrs) ] def _init(self): self.url = u'' def _reset(self): self.url = u'' def preprocess_string(self, html_string): if self._notDirectHitTitle in html_string[:1024].lower(): if self._linkPrefix == '/title/tt': # Only for movies. html_string = html_string.replace('(TV mini-series)', '(mini)') html_string = _reAKAS.sub('</td>', html_string) return html_string # Direct hit! dbme = self._BaseParser(useModule=self._useModule) res = dbme.parse(html_string, url=self.url) if not res: return u'' res = res['data'] if not (res and res[0]): return u'' link = '%s%s' % (self._linkPrefix, res[0][0]) # # Tries to cope with companies for which links to pro.imdb.com # # are missing. # link = self.url.replace(imdbURL_base[:-1], '') title = self._titleBuilder(res[0][1]) if not (link and title): return u'' link = link.replace('http://pro.imdb.com', '') new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link, title) return new_html def postprocess_data(self, data): if not data.has_key('data'): data['data'] = [] results = getattr(self, 'results', None) if results is not None: data['data'][:] = data['data'][:results] return data def add_refs(self, data): return data
Train = True

if version == 1:
    depth = n * 6 + 2
elif version == 2:
    depth = n * 9 + 2

# Model name, depth and version
model_type = 'ResNet_%s' % (model_key)

train_txt_fp = config.get("FILE_PATH", 'train_imagepath_label')
train_pt = './train/train/'
class_wordembedings_txt_fp = config.get("FILE_PATH", 'class_wordembeddings_reduced_100')
label_list_fp = config.get('FILE_PATH', 'lable_list')

x_train, y_train, x_test, y_test = Extractor.readTrainDataVersion2(
    train_txt_fp, train_pt, class_wordembedings_txt_fp, label_list_fp)
print(y_train.shape)

input_shape = x_train.shape[1:]
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

if subtract_pixel_mean:
    x_train_mean = np.mean(x_train, axis=0)
    x_train -= x_train_mean
    x_test -= x_train_mean

print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)

save_dir = os.path.join(os.getcwd(), 'saved_models')
model_name = 'cifar10_%s_model.{epoch:03d}.{val_acc:03f}.h5' % model_type
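# Hedged sketch of how the `config` object and the scalars used above
# (version, n, model_key, subtract_pixel_mean) could be set up; only the
# section/key names come from the snippet, the file name and the values
# are hypothetical. Under Python 2 the module is named ConfigParser.
import configparser

config = configparser.ConfigParser()
config.read('config.ini')       # hypothetical configuration file

version = 1                     # assumed ResNet version selector
n = 3                           # assumed depth multiplier
model_key = 'v%d_n%d' % (version, n)
subtract_pixel_mean = True      # assumed normalization flag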
class DOMHTMLBioParser(DOMParserBase): """Parser for the "biography" page of a given person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: bioparser = DOMHTMLBioParser() result = bioparser.parse(biography_html_string) """ _defGetRefs = True _birth_attrs = [Attribute(key='birth date', path={ 'day': "./a[starts-with(@href, " \ "'/search/name?birth_monthday=')]/text()", 'year': "./a[starts-with(@href, " \ "'/search/name?birth_year=')]/text()" }, postprocess=build_date), Attribute(key='birth notes', path="./a[starts-with(@href, " \ "'/search/name?birth_place=')]/text()")] _death_attrs = [Attribute(key='death date', path={ 'day': "./a[starts-with(@href, " \ "'/search/name?death_monthday=')]/text()", 'year': "./a[starts-with(@href, " \ "'/search/name?death_date=')]/text()" }, postprocess=build_date), Attribute(key='death notes', path="./text()", # TODO: check if this slicing is always correct postprocess=lambda x: u''.join(x).strip()[2:])] extractors = [ Extractor(label='headshot', path="//a[@name='headshot']", attrs=Attribute(key='headshot', path="./img/@src")), Extractor(label='birth info', path="//table[@id='overviewTable']//td[text()='Date of Birth']/following-sibling::td[1]", attrs=_birth_attrs), Extractor(label='death info', path="//table[@id='overviewTable']//td[text()='Date of Death']/following-sibling::td[1]", attrs=_death_attrs), Extractor(label='nick names', path="//table[@id='overviewTable']//td[text()='Nickenames']/following-sibling::td[1]", attrs=Attribute(key='nick names', path="./text()", joiner='|', postprocess=lambda x: [n.strip().replace(' (', '::(', 1) for n in x.split('|') if n.strip()])), Extractor(label='birth name', path="//table[@id='overviewTable']//td[text()='Birth Name']/following-sibling::td[1]", attrs=Attribute(key='birth name', path="./text()", postprocess=lambda x: canonicalName(x.strip()))), Extractor(label='height', path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]", attrs=Attribute(key='height', path="./text()", postprocess=lambda x: x.strip())), Extractor(label='mini biography', path="//a[@name='mini_bio']/following-sibling::div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]", attrs=Attribute(key='mini biography', multi=True, path={ 'bio': ".//text()", 'by': ".//a[@name='ba']//text()" }, postprocess=lambda x: "%s::%s" % \ ((x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(), (x.get('by') or u'').strip() or u'Anonymous'))), Extractor(label='spouse', path="//div[h5='Spouse']/table/tr", attrs=Attribute(key='spouse', multi=True, path={ 'name': "./td[1]//text()", 'info': "./td[2]//text()" }, postprocess=lambda x: ("%s::%s" % \ (x.get('name').strip(), (x.get('info') or u'').strip())).strip(':'))), Extractor(label='trade mark', path="//div[h5='Trade Mark']/p", attrs=Attribute(key='trade mark', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='trivia', path="//div[h5='Trivia']/p", attrs=Attribute(key='trivia', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='quotes', path="//div[h5='Personal Quotes']/p", attrs=Attribute(key='quotes', multi=True, path=".//text()", postprocess=lambda x: x.strip())), Extractor(label='salary', path="//div[h5='Salary']/table/tr", attrs=Attribute(key='salary history', multi=True, path={ 'title': "./td[1]//text()", 'info': "./td[2]/text()", }, postprocess=lambda x: "%s::%s" % \ 
(x.get('title').strip(), x.get('info').strip()))), Extractor(label='where now', path="//div[h5='Where Are They Now']/p", attrs=Attribute(key='where now', multi=True, path=".//text()", postprocess=lambda x: x.strip())), ] preprocessors = [(re.compile('(<h5>)', re.I), r'</div><div class="_imdbpy">\1'), (re.compile('(</table>\n</div>\s+)</div>', re.I + re.DOTALL), r'\1'), (re.compile('(<div id="tn15bot">)'), r'</div>\1'), (re.compile('\.<br><br>([^\s])', re.I), r'. \1')] def postprocess_data(self, data): for what in 'birth date', 'death date': if what in data and not data[what]: del data[what] return data
""" Author: Sulley Date: 2020.2.29 """ import chardet import codecs import os import sys import csv import xlrd import docx import jieba import itertools, string from pypinyin import pinyin, lazy_pinyin, Style from PyQt5.Qt import * from utils import Converter, Counter, Extractor, Corpus, Lexicon from window import Window, EmittingStream if __name__ == '__main__': converter = Converter() counter = Counter(converter) extractor = Extractor(converter) corpus = Corpus() lexicon = Lexicon() app = QApplication(sys.argv) exe = Window(converter, counter, extractor, corpus, lexicon) sys.exit(app.exec_())
class DOMHTMLMaindetailsParser(DOMParserBase): """Parser for the "categorized" (maindetails) page of a given person. The page should be provided as a string, as taken from the akas.imdb.com server. The final result will be a dictionary, with a key for every relevant section. Example: cparser = DOMHTMLMaindetailsParser() result = cparser.parse(categorized_html_string) """ _containsObjects = True _name_imdb_index = re.compile(r'\([IVXLCDM]+\)') _birth_attrs = [Attribute(key='birth date', path='.//time[@itemprop="birthDate"]/@datetime'), Attribute(key='birth place', path=".//a[starts-with(@href, " \ "'/search/name?birth_place=')]/text()")] _death_attrs = [Attribute(key='death date', path='.//time[@itemprop="deathDate"]/@datetime'), Attribute(key='death place', path=".//a[starts-with(@href, " \ "'/search/name?death_place=')]/text()")] _film_attrs = [ Attribute(key=None, multi=True, path={ 'link': "./b/a[1]/@href", 'title': "./b/a[1]/text()", 'notes': "./b/following-sibling::text()", 'year': "./span[@class='year_column']/text()", 'status': "./a[@class='in_production']/text()", 'rolesNoChar': './/br/following-sibling::text()', 'chrRoles': "./a[@imdbpyname]/@imdbpyname", 'roleID': "./a[starts-with(@href, '/character/')]/@href" }, postprocess=lambda x: build_movie( x.get('title') or u'', year=x.get('year'), movieID=analyze_imdbid(x.get('link') or u''), rolesNoChar=(x.get('rolesNoChar') or u'').strip(), chrRoles=(x.get('chrRoles') or u'').strip(), additionalNotes=x.get('notes'), roleID=(x.get('roleID') or u''), status=x.get('status') or None)) ] extractors = [ Extractor(label='name', path="//h1[@class='header']", attrs=Attribute(key='name', path=".//text()", postprocess=lambda x: analyze_name(x, canonical=1))), Extractor(label='name_index', path="//h1[@class='header']/span[1]", attrs=Attribute(key='name_index', path="./text()")), Extractor(label='birth info', path="//div[h4='Born:']", attrs=_birth_attrs), Extractor(label='death info', path="//div[h4='Died:']", attrs=_death_attrs), Extractor(label='headshot', path="//td[@id='img_primary']/div[@class='image']/a", attrs=Attribute(key='headshot', path="./img/@src")), Extractor(label='akas', path="//div[h4='Alternate Names:']", attrs=Attribute(key='akas', path="./text()", postprocess=lambda x: x.strip().split(' '))), Extractor(label='filmography', group="//div[starts-with(@id, 'filmo-head-')]", group_key="./a[@name]/text()", group_key_normalize=lambda x: x.lower().replace(': ', ' '), path="./following-sibling::div[1]" \ "/div[starts-with(@class, 'filmo-row')]", attrs=_film_attrs), Extractor(label='indevelopment', path="//div[starts-with(@class,'devitem')]", attrs=Attribute(key='in development', multi=True, path={ 'link': './a/@href', 'title': './a/text()' }, postprocess=lambda x: build_movie(x.get('title') or u'', movieID=analyze_imdbid(x.get('link') or u''), roleID=(x.get('roleID') or u'').split('/'), status=x.get('status') or None))) ] preprocessors = [ ('<div class="clear"/> </div>', ''), ('<br/>', '<br />'), (re.compile(r'(<a href="/character/ch[0-9]{7}")>(.*?)</a>'), r'\1 imdbpyname="\2@@">\2</a>') ] def postprocess_data(self, data): for what in 'birth date', 'death date': if what in data and not data[what]: del data[what] name_index = (data.get('name_index') or '').strip() if name_index: if self._name_imdb_index.match(name_index): data['imdbIndex'] = name_index[1:-1] del data['name_index'] # XXX: the code below is for backwards compatibility # probably could be removed for key in data.keys(): if key.startswith('actor '): if not 
data.has_key('actor'): data['actor'] = [] data['actor'].extend(data[key]) del data[key] if key.startswith('actress '): if not data.has_key('actress'): data['actress'] = [] data['actress'].extend(data[key]) del data[key] if key.startswith('self '): if not data.has_key('self'): data['self'] = [] data['self'].extend(data[key]) del data[key] if key == 'birth place': data['birth notes'] = data[key] del data[key] if key == 'death place': data['death notes'] = data[key] del data[key] return data
model_save_fp = config.get("MODEL", 'model_save_fp')
train_label_20_fp = config.get('FILE_PATH', 'train_label_20')
train_lable_fp = config.get('MODEL', 'train_lable')
train_imageName_Lable_fp = config.get("MODEL", 'train_imageName_Lable_fp')
image_path = config.get("MODEL", 'image_path')

if version == 1:
    depth = n * 6 + 2
elif version == 2:
    depth = n * 9 + 2

model_type = 'ResNet_%s' % (model_key)
save_dir = os.path.join(config.get('MODEL', 'data_pre_pt'), 'saved_models')

x_train, y_train, x_test, y_test, train_cate_num = Extractor.gainTrainAndTest(
    train_lable_fp, train_imageName_Lable_fp, image_path)
num_classes = train_cate_num

input_shape = x_train.shape[1:]
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

if subtract_pixel_mean:
    x_train_mean = np.mean(x_train, axis=0)
    x_train -= x_train_mean
    x_test -= x_train_mean

print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)

model_name = 'cifar10_%s_model.{epoch:03d}.{val_acc:03f}.h5' % model_type
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
class DOMHTMLSearchMovieParser(DOMParserBase): """Parse the html page that the IMDb web server shows when the "new search system" is used, for movies.""" _BaseParser = DOMBasicMovieParser _notDirectHitTitle = '<title>find - imdb</title>' _titleBuilder = lambda self, x: build_title(x) _linkPrefix = '/title/tt' _attrs = [ Attribute(key='data', multi=True, path={ 'link': "./a[1]/@href", 'info': ".//text()", 'akas': "./i//text()" }, postprocess=lambda x: (analyze_imdbid(x.get('link') or u''), custom_analyze_title(x.get('info') or u''), x.get('akas'))) ] extractors = [ Extractor(label='search', path="//td[@class='result_text']", attrs=_attrs) ] def _init(self): self.url = u'' def _reset(self): self.url = u'' def preprocess_string(self, html_string): if self._notDirectHitTitle in html_string[:10240].lower(): if self._linkPrefix == '/title/tt': # Only for movies. # XXX (HTU): does this still apply? html_string = html_string.replace('(TV mini-series)', '(mini)') return html_string # Direct hit! dbme = self._BaseParser(useModule=self._useModule) res = dbme.parse(html_string, url=self.url) if not res: return u'' res = res['data'] if not (res and res[0]): return u'' link = '%s%s' % (self._linkPrefix, res[0][0]) # # Tries to cope with companies for which links to pro.imdb.com # # are missing. # link = self.url.replace(imdbURL_base[:-1], '') title = self._titleBuilder(res[0][1]) if not (link and title): return u'' link = link.replace('http://pro.imdb.com', '') new_html = '<td class="result_text"><a href="%s">%s</a></td>' % (link, title) return new_html def postprocess_data(self, data): if not data.has_key('data'): data['data'] = [] results = getattr(self, 'results', None) if results is not None: data['data'][:] = data['data'][:results] # Horrible hack to support AKAs. if data and data['data'] and len(data['data'][0]) == 3 and \ isinstance(data['data'][0], tuple): data['data'] = [x for x in data['data'] if x[0] and x[1]] for idx, datum in enumerate(data['data']): if not isinstance(datum, tuple): continue if not datum[0] and datum[1]: continue if datum[2] is not None: #akas = filter(None, datum[2].split('::')) if self._linkPrefix == '/title/tt': # XXX (HTU): couldn't find a result with multiple akas aka = datum[2] akas = [aka[1:-1]] # remove the quotes #akas = [a.replace('" - ', '::').rstrip() for a in akas] #akas = [a.replace('aka "', '', 1).replace('aka "', #'', 1).lstrip() for a in akas] datum[1]['akas'] = akas data['data'][idx] = (datum[0], datum[1]) else: data['data'][idx] = (datum[0], datum[1]) return data def add_refs(self, data): return data
model = load_model(mode_fp)
model.summary()

if config.getboolean('EXTRACT_IMAGE_FEATURE', 'extract_all_train'):
    print('getting all train fc vector...')
    model_key = config.get('EXTRACT_IMAGE_FEATURE', 'model_key')
    model_path = config.get("EXTRACT_IMAGE_FEATURE", 'model_path')
    if not os.path.exists(model_path):
        os.mkdir(model_path)
    save_fp = model_path + config.get('EXTRACT_IMAGE_FEATURE', 'fc_vector_alltrain')
    train_lable_fp = config.get('EXTRACT_IMAGE_FEATURE', 'train_lable_fp')
    train_imageName_Lable_fp = config.get("EXTRACT_IMAGE_FEATURE", 'train_imageName_Lable_fp')
    image_path_train = config.get("EXTRACT_IMAGE_FEATURE", 'image_path_train')

    X, Y = Extractor.gainVal(train_lable_fp, train_imageName_Lable_fp, image_path_train)
    print("total x:", X.shape[0])
    X = X.astype('float32') / 255

    x_train, y_train, x_test, y_test, train_cate_num = Extractor.gainTrainAndTest(
        train_lable_fp, train_imageName_Lable_fp, image_path_train)
    x_train = x_train.astype('float32') / 255
    x_train_mean = np.mean(x_train, axis=0)
    X -= x_train_mean

    pre_y, pre_fc = model.predict(X, verbose=1)
    # print(MyFunction.computeAcc(Y, pre_y))
    MyFunction.saveFcLayer(pre_y, Y, pre_fc, save_fp, train_lable_fp)

if config.getboolean('EXTRACT_IMAGE_FEATURE', 'extract_val'):
    print("getting val fc vector...")
    model_key = config.get('MODEL', 'model_key')
    train_lable = config.get('MODEL', 'train_lable')