Example #1
def process_source(src):
    """
    Parse the source according to the given schema,
    create a list of dicts with Authority records.
    """
    schema = define_schema(AuthorityProfile)

    if VERBOSE: print "Extracting the data from the source CSV file."
    try:
        authorities = EX.extract(src, schema)
        if VERBOSE: print "Extraction completed successfully."
    except Exception as e:
        print e
        return False

    if VERBOSE: print "Processing data from the obtained dataset."
    try:
        authorities = process_extracted_data(authorities)
    except Exception as e:
        print e
        return False

    if VERBOSE: print "Processing complete."
    return True
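# A minimal usage sketch (not part of the original example): it assumes the
# module-level VERBOSE flag and the EX extractor used above are already set
# up, and the CSV path below is illustrative only.
if process_source("authorities.csv"):
    print "Source processed successfully."
else:
    print "Source processing failed."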
class DOMHTMLPersonGenresParser(DOMParserBase):
    """Parser for the "by genre" and "by keywords" pages of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        gparser = DOMHTMLPersonGenresParser()
        result = gparser.parse(bygenre_html_string)
    """
    kind = 'genres'
    _containsObjects = True

    extractors = [
            Extractor(label='genres',
                        group="//b/a[@name]/following-sibling::a[1]",
                        group_key="./text()",
                        group_key_normalize=lambda x: x.lower(),
                        path="../../following-sibling::ol[1]/li//a[1]",
                        attrs=Attribute(key=None,
                            multi=True,
                            path={
                                'link': "./@href",
                                'title': "./text()",
                                'info': "./following-sibling::text()"
                                },
                            postprocess=lambda x: \
                                    build_movie(x.get('title') + \
                                    x.get('info').split('[')[0],
                                    analyze_imdbid(x.get('link')))))
            ]

    def postprocess_data(self, data):
        if len(data) == 0:
            return {}
        return {self.kind: data}
Example #3
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""

    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [Attribute(key='data',
                        multi=True,
                        path={
                            'link': "./a[1]/@href",
                            'info': ".//text()",
                            #'akas': ".//div[@class='_imdbpyAKA']//text()"
                            'akas': ".//p[@class='find-aka']//text()"
                            },
                        postprocess=lambda x: (
                            analyze_imdbid(x.get('link') or u''),
                            custom_analyze_title(x.get('info') or u''),
                            x.get('akas')
                        ))]
    extractors = [Extractor(label='search',
                        path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                        attrs=_attrs)]
    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)', '(mini)')
                html_string = html_string.replace('<p class="find-aka">',
                        '<p class="find-aka">::')
                #html_string = _reAKAStitles.sub(
                #        r'<div class="_imdbpyAKA">\1::</div>\2', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res: return u''
        res = res['data']
        if not (res and res[0]): return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        #    # Tries to cope with companies for which links to pro.imdb.com
        #    # are missing.
        #    link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title): return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                    title)
        return new_html

    def postprocess_data(self, data):
        if not data.has_key('data'):
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        if data and data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            for idx, datum in enumerate(data['data']):
                if datum[2] is not None:
                    akas = filter(None, datum[2].split('::'))
                    if self._linkPrefix == '/title/tt':
                        akas = [a.replace('" - ', '::').rstrip() for a in akas]
                        akas = [a.replace('aka "', '', 1).lstrip() for a in akas]
                    datum[1]['akas'] = akas
                    data['data'][idx] = (datum[0], datum[1])
                else:
                    data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        return data
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    _birth_attrs = [Attribute(key='birth date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/OnThisDay?')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/BornInYear?')]/text()"
                            },
                        postprocess=lambda x: build_date(x)),
                    Attribute(key='birth notes',
                        path="./a[starts-with(@href, '/BornWhere?')]/text()")]
    _death_attrs = [Attribute(key='death date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/OnThisDay?')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/DiedInYear?')]/text()"
                            },
                        postprocess=lambda x: build_date(x)),
                    Attribute(key='death notes',
                        path="./text()",
                        # TODO: check if this slicing is always correct
                        postprocess=lambda x: u''.join(x).strip()[2:])]
    extractors = [
            Extractor(label='birth info',
                        path="//div[h5='Date of Birth']",
                        attrs=_birth_attrs),
            Extractor(label='death info',
                        path="//div[h5='Date of Death']",
                        attrs=_death_attrs),
            Extractor(label='nick names',
                        path="//div[h5='Nickname']",
                        attrs=Attribute(key='nick names',
                            path="./text()",
                            joiner='|',
                            postprocess=lambda x: [n.strip().replace(' (',
                                    '::(', 1) for n in x.split('|')
                                    if n.strip()])),
            Extractor(label='birth name',
                        path="//div[h5='Birth Name']",
                        attrs=Attribute(key='birth name',
                            path="./text()",
                            postprocess=lambda x: canonicalName(x.strip()))),
            Extractor(label='height',
                        path="//div[h5='Height']",
                        attrs=Attribute(key='height',
                            path="./text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='mini biography',
                        path="//div[h5='Mini Biography']",
                        attrs=Attribute(key='mini biography',
                            multi=True,
                            path={
                                'bio': "./p//text()",
                                'by': "./b/following-sibling::a/text()"
                                },
                            postprocess=lambda x: "%s::%s" % \
                                (x.get('bio').strip(),
                                (x.get('by') or u'').strip() or u'Anonymous'))),
            Extractor(label='spouse',
                        path="//div[h5='Spouse']/table/tr",
                        attrs=Attribute(key='spouse',
                            multi=True,
                            path={
                                'name': "./td[1]//text()",
                                'info': "./td[2]//text()"
                                },
                            postprocess=lambda x: "%s::%s" % \
                                            (x.get('name').strip(),
                                                x.get('info').strip()))),
            Extractor(label='trade mark',
                        path="//div[h5='Trade Mark']/p",
                        attrs=Attribute(key='trade mark',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='trivia',
                        path="//div[h5='Trivia']/p",
                        attrs=Attribute(key='trivia',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='quotes',
                        path="//div[h5='Personal Quotes']/p",
                        attrs=Attribute(key='quotes',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='salary',
                        path="//div[h5='Salary']/table/tr",
                        attrs=Attribute(key='salary history',
                            multi=True,
                            path={
                                'title': "./td[1]//text()",
                                'info': "./td[2]/text()",
                                },
                            postprocess=lambda x: "%s::%s" % \
                                    (x.get('title').strip(),
                                        x.get('info').strip()))),
            Extractor(label='where now',
                        path="//div[h5='Where Are They Now']/p",
                        attrs=Attribute(key='where now',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            ]

    preprocessors = [(re.compile('(<h5>)',
                                 re.I), r'</div><div class="_imdbpy">\1'),
                     (re.compile('(</table>\n</div>\s+)</div>',
                                 re.I + re.DOTALL), r'\1'),
                     (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
                     (re.compile('\.<br><br>([^\s])', re.I), r'. \1')]
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True

    _birth_attrs = [Attribute(key='birth date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/OnThisDay?')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/BornInYear?')]/text()"
                            },
                        postprocess=lambda x: build_date(x)),
                    Attribute(key='birth notes',
                        path="./a[starts-with(@href, '/BornWhere?')]/text()")]
    _death_attrs = [Attribute(key='death date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/OnThisDay?')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/DiedInYear?')]/text()"
                            },
                        postprocess=lambda x: build_date(x)),
                    Attribute(key='death notes',
                        path="./text()",
                        # TODO: check if this slicing is always correct
                        postprocess=lambda x: x.strip()[2:])]
    _film_attrs = [
        Attribute(key=None,
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'title': ".//text()",
                      'status': "./i/a//text()",
                      'roleID': "./div[@class='_imdbpyrole']/@roleid"
                  },
                  postprocess=lambda x: build_movie(
                      x.get('title') or u'',
                      movieID=analyze_imdbid(x.get('link') or u''),
                      roleID=(x.get('roleID') or u'').split('/'),
                      status=x.get('status') or None))
    ]

    extractors = [
        Extractor(label='page title',
                  path="//title",
                  attrs=Attribute(
                      key='name',
                      path="./text()",
                      postprocess=lambda x: analyze_name(x, canonical=1))),
        Extractor(label='birth info',
                  path="//div[h5='Date of Birth:']",
                  attrs=_birth_attrs),
        Extractor(label='death info',
                  path="//div[h5='Date of Death:']",
                  attrs=_death_attrs),
        Extractor(label='headshot',
                  path="//a[@name='headshot']",
                  attrs=Attribute(key='headshot', path="./img/@src")),
        Extractor(label='akas',
                  path="//div[h5='Alternate Names:']",
                  attrs=Attribute(
                      key='akas',
                      path="./text()",
                      postprocess=lambda x: x.strip().split(' | '))),
        Extractor(label='filmography',
                  group="//div[@class='filmo'][h5]",
                  group_key="./h5/a[@name]/text()",
                  group_key_normalize=lambda x: x.lower()[:-1],
                  path="./ol/li",
                  attrs=_film_attrs)
    ]
    preprocessors = [
        # XXX: check that this doesn't cut "status" or other info...
        (re.compile(r'<br>(\.\.\.|    ?).+?</li>',
                    re.I | re.M | re.S), '</li>'),
        (_reRoles, _manageRoles)
    ]
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
    """Parser for the "biography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterMaindetailsParser()
        result = bparser.parse(character_biography_html_string)
    """
    _containsObjects = True

    _film_attrs = [Attribute(key=None,
                      multi=True,
                      path={
                          'link': "./a[1]/@href",
                          'title': ".//text()",
                          'status': "./i/a//text()",
                          'roleID': "./a/@href"
                          },
                      postprocess=lambda x:
                          build_movie(x.get('title') or u'',
                              movieID=analyze_imdbid(x.get('link') or u''),
                              roleID=_personIDs.findall(x.get('roleID') or u''),
                              status=x.get('status') or None,
                              _parsingCharacter=True))]

    extractors = [
            Extractor(label='title',
                        path="//title",
                        attrs=Attribute(key='name',
                            path="./text()",
                            postprocess=lambda x: \
                                    x.replace(' (Character)', '').strip())),

            Extractor(label='headshot',
                        path="//a[@name='headshot']",
                        attrs=Attribute(key='headshot',
                            path="./img/@src")),

            Extractor(label='akas',
                        path="//div[h5='Alternate Names:']",
                        attrs=Attribute(key='akas',
                            path="./text()",
                            postprocess=lambda x: x.strip().split(' / '))),

            Extractor(label='filmography',
                        path="//div[@class='filmo'][not(h5)]/ol/li",
                        attrs=_film_attrs),

            Extractor(label='filmography sections',
                        group="//div[@class='filmo'][h5]",
                        group_key="./h5/a/text()",
                        group_key_normalize=lambda x: x.lower()[:-1],
                        path="./ol/li",
                        attrs=_film_attrs),
            ]

    preprocessors = [
            # Check that this doesn't cut "status"...
            (re.compile(r'<br>(\.\.\.|   ).+?</li>', re.I | re.M), '</li>')]
class DOMHTMLResumeParser(DOMParserBase):
    """Parser for the "resume" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        resumeparser = DOMHTMLResumeParser()
        result = resumeparser.parse(resume_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(
            label='info',
            group="//div[@class='section_box']",
            group_key="./h3/text()",
            group_key_normalize=lambda x: x.lower().replace(' ', '_'),
            path="./ul[@class='resume_section_multi_list']//li",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    'title': ".//b//text()",
                    'desc': ".//text()",
                },
                postprocess=lambda x:
                (x.get('title'), x.get('desc').strip().replace('\n', ' ')))),
        Extractor(label='other_info',
                  group="//div[@class='section_box']",
                  group_key="./h3/text()",
                  group_key_normalize=lambda x: x.lower().replace(' ', '_'),
                  path="./ul[@class='_imdbpy']//li",
                  attrs=Attribute(
                      key=None,
                      multi=True,
                      path=".//text()",
                      postprocess=lambda x: x.strip().replace('\n', ' '))),
        Extractor(
            label='credits',
            group="//div[@class='section_box']",
            group_key="./h3/text()",
            group_key_normalize=lambda x: x.lower().replace(' ', '_'),
            path="./table[@class='credits']//tr",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    '0': ".//td[1]//text()",
                    '1': ".//td[2]//text()",
                    '2': ".//td[3]//text()",
                },
                postprocess=lambda x: [x.get(
                    '0'), x.get('1'), x.get('2')])),
        Extractor(label='mini_info',
                  path="//div[@class='center']",
                  attrs=Attribute(
                      key='mini_info',
                      path=".//text()",
                      postprocess=lambda x: x.strip().replace('\n', ' '))),
        Extractor(label='name',
                  path="//div[@class='center']/h1[@id='preview_user_name']",
                  attrs=Attribute(
                      key='name',
                      path=".//text()",
                      postprocess=lambda x: x.strip().replace('\n', ' '))),
        Extractor(label='resume_bio',
                  path="//div[@id='resume_rendered_html']//p",
                  attrs=Attribute(key='resume_bio',
                                  multi=True,
                                  path=".//text()")),
    ]

    preprocessors = [
        (re.compile('(<ul>)', re.I), r'<ul class="_imdbpy">\1'),
    ]

    def postprocess_data(self, data):

        for key in data.keys():
            if data[key] == '':
                # Drop empty values and skip any further handling of this key.
                del data[key]
                continue
            if key in ('mini_info', 'name', 'resume_bio'):
                if key == 'resume_bio':
                    data[key] = "".join(data[key]).strip()
                continue
            if len(data[key][0]) == 3:
                for item in data[key]:
                    item[:] = [x for x in item if x is not None]
                continue

            if len(data[key][0]) == 2:
                new_key = {}
                for item in data[key]:
                    if item[0] is None:
                        continue
                    if ':' in item[0]:
                        if item[1].replace(item[0], '')[1:].strip() == '':
                            continue
                        new_key[item[0].strip().replace(':',
                                                        '')] = item[1].replace(
                                                            item[0],
                                                            '')[1:].strip()
                    else:
                        new_key[item[0]] = item[1]
                data[key] = new_key

        new_data = {'resume': data}
        return new_data
Example #8
    def crawl(self, download=True):
        logger.info('')
        logger.info('Start crawling %s (%s)' % (self.name, self.url))

        # Custom definitions
        metapath = eval(self.meta_xpath) if self.meta_xpath else None
        expand_rules = self.expand_rules.split('\n') \
            if self.expand_rules else None
        refine_rules = [item.strip() for item in self.refine_rules.split('\n')
                        if item.strip()]
        extrapath = [item.strip() for item in self.extra_xpath.split('\n')
                     if item.strip()]

        proxy = self.proxy.get_dict() if self.proxy else None
        logger.info('Use proxy server: %s' % self.proxy)

        ua = self.user_agent.value if self.user_agent else None
        logger.info('Use user agent: %s' % self.user_agent)

        # Initialize extractor
        self._extractor = Extractor(self.url, settings.CRAWL_ROOT,
                                    proxies=proxy, user_agent=ua)
        make_root = False
        if self.link_xpath.startswith('/+'):
            make_root = True
            self.link_xpath = self.link_xpath[2:]

        all_links = self._extractor.extract_links(
            xpath=self.link_xpath,
            expand_rules=expand_rules,
            depth=self.crawl_depth,
            make_root=make_root)
        logger.info('%d link(s) found' % len(all_links))

        # Just dry running or real download
        if download:
            blacklist = []
            local_content = []
            if self.black_words:
                blacklist = self.black_words.words.split('\n')
            for link in all_links:
                try:
                    link_url = link['url']
                    if LocalContent.objects.filter(url=link_url).count():
                        logger.info('Bypass %s' % link_url)
                        continue
                    logger.info('Download %s' % link_url)
                    location = datetime.now().strftime('%Y/%m/%d')
                    location = os.path.join(settings.CRAWL_ROOT, location)
                    sub_extr = Extractor(link_url, location, proxy)
                    if self.content_type:
                        base_meta = {'type': self.content_type.name}
                    else:
                        base_meta = None
                    local_path = sub_extr.extract_content(
                        self.content_xpath,
                        with_image=self.download_image,
                        metapath=metapath,
                        extrapath=extrapath,
                        custom_rules=refine_rules,
                        blacklist=blacklist,
                        metadata=base_meta)
                    content = LocalContent(url=link_url, source=self,
                                           local_path=local_path)
                    content.save()
                    local_content.append(content)
                except:
                    logger.exception('Error when extracting %s' % link['url'])
            paths = [lc.local_path for lc in local_content]
            return paths
        else:
            return all_links
Example #9
class Prediction:
    def __init__(self):
        rospy.init_node("onlinePrediction", anonymous=True)
        self.__controlSignal = False
        self.__controlState = 0
        self.__isAngleValid = False
        self.__isImuValid = False
        self.__extract = Extractor()
        self.__data = []
        self.__isPredictionValid = False
        self.__model = Model()
        self.__collectData = []
        self.__isAngleInit = False
        self.__initAngleData = []
        self.__angleOffset = 0
        self.__pubFuture = rospy.Publisher("predicted_trajectory",
                                           FutureTrajectory,
                                           queue_size=4)
        self.__pubLocation = rospy.Publisher("nowLocation",
                                             Point,
                                             queue_size=4)
        rospy.Subscriber("robot_odm", Pose2D, self.odmCallback, queue_size=4)
        rospy.Subscriber("openpose_ros/human_depth_list",
                         HumanDepthList,
                         self.humanListCallback,
                         queue_size=50)
        rospy.Subscriber("/robot/controlSignal", Bool,
                         self.controlSignalCallback)
        rospy.Subscriber("/imu", Imu, self.imuCallback, queue_size=3)
        rospy.Subscriber("/imu_angle", Vector3, self.angleCallback)

    def odmCallback(self, odm):
        self.__odm = odm

    def controlSignalCallback(self, controlSignal):
        self.__controlSignal = controlSignal.data

    def humanListCallback(self, humanDepthList):
        if self.__isImuValid is False or self.__isAngleValid is False:
            return
        if self.__controlState == 0:
            # A control state of 0 means recording has not started yet.
            if self.__controlSignal is False:
                return
            else:
                print "start recording!"
                self.__controlState = 1
        if self.__controlState == 1:
            if self.__controlSignal is True:
                # Record the data.
                curData = self.__extract.extractHumanPose(
                    humanDepthList, self.__odm)
                curData.extend([
                    self.__angleData.x, self.__angleData.y, self.__angleData.z,
                    self.__imuData.linear_acceleration.x,
                    self.__imuData.linear_acceleration.y,
                    self.__imuData.linear_acceleration.z,
                    self.__imuData.angular_velocity.x,
                    self.__imuData.angular_velocity.y,
                    self.__imuData.angular_velocity.z
                ])
                self.__data.append(curData)
                nowLocation = Point()
                nowLocation.x = curData[0]
                nowLocation.y = curData[1]
                nowLocation.z = 0
                self.__pubLocation.publish(nowLocation)
                if self.__isAngleInit is False:
                    self.__initAngleData.append(self.__angleData.z)
                    if len(self.__initAngleData) >= 8:
                        self.__isAngleInit = True
                        self.__angleOffset = sum(self.__initAngleData) / len(
                            self.__initAngleData)
                        self.__initAngleData = []
                if len(self.__data) >= 11:
                    self.__isPredictionValid = True
                if len(self.__data) > 11:
                    del self.__data[0]
                if self.__isPredictionValid is True:
                    # print self.__odm
                    npTrajectory = np.array(self.__data)
                    cur = time.time()
                    npFuture = self.__model.predictFuture(
                        npTrajectory[:, srcIndex])
                    Weights = np.matmul(T_inv, npFuture)
                    # plotLocationData(npTrajectory.transpose(), npFuture.transpose())
                    self.__collectData.append([
                        npTrajectory[:, srcIndex].transpose(),
                        npFuture.transpose()
                    ])
                    futureTrajectory = FutureTrajectory()
                    for i in range(0, 8):
                        point = Point()
                        point.x = npFuture[i][0]
                        point.y = npFuture[i][1]
                        point.z = 0
                        futureTrajectory.locations.append(point)
                    for i in range(0, 3):
                        for j in range(0, 2):
                            futureTrajectory.weights.append(
                                Float64(Weights[i][j] / 1000))
                    futureTrajectory.weights.append(
                        Float64(self.__angleData.z - self.__angleOffset))
                    print((self.__angleData.z - self.__angleOffset) / 180 *
                          3.14159)
                    now = time.time()
                    # print( now - cur )
                    self.__pubFuture.publish(futureTrajectory)

            else:
                self.__controlState = 0
                # Save the data to a file and clear the buffer.
                nowTime = time.strftime("%Y-%m-%d-%H_%M_%S",
                                        time.localtime(time.time()))
                fileName = nowTime + ' Line3' + '.npy'
                # plotData(self.__data)
                np.save(fileName, self.__data)
                self.__data = []
                self.__extract = Extractor()
                print "save as " + fileName
                self.__isAngleInit = False
                '''
                for i in range(len(self.__collectData)):
                    if i % 5 == 0:
                        plotLocationData(self.__collectData[i][0], self.__collectData[i][1])
                '''

    def imuCallback(self, imuData):
        self.__isImuValid = True
        self.__imuData = imuData

    def angleCallback(self, angleData):
        self.__isAngleValid = True
        self.__angleData = angleData

    def run(self):
        rospy.spin()
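# A minimal launch sketch (not part of the original example): the node is
# constructed once and then handed over to rospy's event loop via run().
if __name__ == "__main__":
    prediction_node = Prediction()
    prediction_node.run()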
Example #10
class CollectData:
    def __init__(self):
        rospy.init_node("CollectData", anonymous=True)
        rospy.Subscriber("robot_odm", Pose2D, self.odmCallback)
        rospy.Subscriber("openpose_ros/human_depth_list", HumanDepthList,
                         self.humanListCallback)
        rospy.Subscriber("/robot/controlSignal", Bool,
                         self.controlSignalCallback)
        rospy.Subscriber("/imu", Imu, self.imuCallback)
        rospy.Subscriber("/imu_angle", Vector3, self.angleCallback)
        self.__controlSignal = False
        self.__controlState = 0
        self.__isAngleValid = False
        self.__isImuValid = False
        self.__extract = Extractor()
        self.__data = []

    def odmCallback(self, odm):
        self.__odm = odm

    def controlSignalCallback(self, controlSignal):
        if controlSignal.data is True:
            time.sleep(12)
        self.__controlSignal = controlSignal.data

    def humanListCallback(self, humanDepthList):
        if self.__isImuValid is False or self.__isAngleValid is False:
            return
        if self.__controlState == 0:
            # A control state of 0 means recording has not started yet.
            if self.__controlSignal is False:
                return
            else:
                print "start recording!"
                self.__controlState = 1
        if self.__controlState == 1:
            if self.__controlSignal is True:
                # Record the data.
                curData = self.__extract.extractHumanPose(
                    humanDepthList, self.__odm)
                curData.extend([
                    self.__angleData.x, self.__angleData.y, self.__angleData.z,
                    self.__imuData.linear_acceleration.x,
                    self.__imuData.linear_acceleration.y,
                    self.__imuData.linear_acceleration.z,
                    self.__imuData.angular_velocity.x,
                    self.__imuData.angular_velocity.y,
                    self.__imuData.angular_velocity.z,
                    time.time(), self.__odm.x, self.__odm.y, self.__odm.theta
                ])
                self.__data.append(curData)
            else:
                self.__controlState = 0
                # Save the data to a file and clear the buffer.
                nowTime = time.strftime("%Y-%m-%d-%H_%M_%S",
                                        time.localtime(time.time()))
                fileName = nowTime + ' Turn1' + '.npy'
                result = plotData(self.__data)
                if result is True:
                    np.save(fileName, self.__data)
                    print "save as " + fileName
                else:
                    print "wrong, reject"
                self.__data = []
                self.__extract = Extractor()

    def imuCallback(self, imuData):
        self.__isImuValid = True
        self.__imuData = imuData

    def angleCallback(self, angleData):
        self.__isAngleValid = True
        self.__angleData = angleData

    def run(self):
        rospy.spin()
Example #11
class Source(models.Model):
    """ This could be a single site or part of a site which contains wanted
        content
    """
    url = models.CharField(max_length=256)
    name = models.CharField(max_length=256, blank=True, null=True)
    # Links section
    link_xpath = models.CharField(max_length=255)
    expand_rules = models.TextField(blank=True, null=True)
    crawl_depth = models.PositiveIntegerField(default=1)
    # Content section
    content_xpath = models.CharField(max_length=255, blank=True, null=True)
    content_type = models.ForeignKey('ContentType', blank=True, null=True)
    meta_xpath = models.TextField(default='', blank=True)
    extra_xpath = models.TextField(default='', blank=True)
    refine_rules = models.TextField(default='', blank=True)
    active = models.BooleanField(default=True)
    download_image = models.BooleanField(default=True)
    # Extra settings
    black_words = models.ForeignKey('WordSet', blank=True, null=True)
    proxy = models.ForeignKey('ProxyServer', blank=True, null=True)
    user_agent = models.ForeignKey('UserAgent', blank=True, null=True)

    def __unicode__(self):
        return '%s' % (self.name or self.url)

    def get_extractor(self):
        return self._extractor

    def crawl(self, download=True):
        logger.info('')
        logger.info('Start crawling %s (%s)' % (self.name, self.url))

        # Custom definitions
        metapath = eval(self.meta_xpath) if self.meta_xpath else None
        expand_rules = self.expand_rules.split('\n') \
            if self.expand_rules else None
        refine_rules = [item.strip() for item in self.refine_rules.split('\n')
                        if item.strip()]
        extrapath = [item.strip() for item in self.extra_xpath.split('\n')
                     if item.strip()]

        proxy = self.proxy.get_dict() if self.proxy else None
        logger.info('Use proxy server: %s' % self.proxy)

        ua = self.user_agent.value if self.user_agent else None
        logger.info('Use user agent: %s' % self.user_agent)

        # Initialize extractor
        self._extractor = Extractor(self.url, settings.CRAWL_ROOT,
                                    proxies=proxy, user_agent=ua)
        make_root = False
        if self.link_xpath.startswith('/+'):
            make_root = True
            self.link_xpath = self.link_xpath[2:]

        all_links = self._extractor.extract_links(
            xpath=self.link_xpath,
            expand_rules=expand_rules,
            depth=self.crawl_depth,
            make_root=make_root)
        logger.info('%d link(s) found' % len(all_links))

        # Just dry running or real download
        if download:
            blacklist = []
            local_content = []
            if self.black_words:
                blacklist = self.black_words.words.split('\n')
            for link in all_links:
                try:
                    link_url = link['url']
                    if LocalContent.objects.filter(url=link_url).count():
                        logger.info('Bypass %s' % link_url)
                        continue
                    logger.info('Download %s' % link_url)
                    location = datetime.now().strftime('%Y/%m/%d')
                    location = os.path.join(settings.CRAWL_ROOT, location)
                    sub_extr = Extractor(link_url, location, proxy)
                    if self.content_type:
                        base_meta = {'type': self.content_type.name}
                    else:
                        base_meta = None
                    local_path = sub_extr.extract_content(
                        self.content_xpath,
                        with_image=self.download_image,
                        metapath=metapath,
                        extrapath=extrapath,
                        custom_rules=refine_rules,
                        blacklist=blacklist,
                        metadata=base_meta)
                    content = LocalContent(url=link_url, source=self,
                                           local_path=local_path)
                    content.save()
                    local_content.append(content)
                except:
                    logger.exception('Error when extracting %s' % link['url'])
            paths = [lc.local_path for lc in local_content]
            return paths
        else:
            return all_links
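# A hypothetical usage sketch (not part of the original model). The field
# values below are illustrative only; crawl() returns the local paths of the
# downloaded items, or the raw link list when download=False.
source = Source(url='http://example.com/news/',
                name='Example news index',
                link_xpath="//div[@class='headline']/a/@href",
                content_xpath="//div[@id='article-body']")
source.save()
downloaded_paths = source.crawl(download=True)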
Example #12
 def __init__(self):
     self.IO = IOManager()
     self.exactor = Extractor('illegal.txt')
Example #13
class DIRT:
    def __init__(self):
        self.IO = IOManager()
        self.exactor = Extractor('illegal.txt')

    def _construct_database(self, corpus_path):
        """Construct the database based on the corpus.

        Args:
            corpus_path: str - The path of corpus.

        Returns:
            Database
        """
        database = Database()
        # construct the database
        for words, poss in self.IO.read_sentences(corpus_path):
            triples = self.exactor.extract(words, poss)
            for triple in triples:
                database.insert(triple)

        return database

    def run(self, corpus_path, test_path, minfreq):
        self._database = self._construct_database(corpus_path)
        before_unique, before_total = self._stas(self._database)
        self._database.apply_minfreq(minfreq)
        after_unique, after_total = self._stas(self._database)
        sim = Similarity(self._database)
        test_phrases = self.IO.read_phrases(test_path)

        with open('trace.txt', 'w', encoding='utf8') as f:
            # Write the head line.
            args = [before_unique, after_unique, before_total, after_total]
            f.write('\n')
            self._write_head(f, args)

            for phrase in test_phrases:
                most_similar = self._find_k_similar(phrase, sim, 5)
                self._write_result(f, phrase, most_similar)

    def _stas(self, database):
        """Return the statistic of the database.
        """
        return len(database), database.path_number()

    def _write_head(self, f, args):
        """Write the head line for output.
        """
        s = 'Found {a} distinct paths, {b} after minfreq filtering.\n'
        s = s.format(a=args[0], b=args[1])
        f.write(s)

        s = 'Found {a} path instances, {b} after minfreq filtering.\n'
        s = s.format(a=args[2], b=args[3])
        f.write(s)
        f.write('\n')

    def _find_k_similar(self, phrase, sim, k=5):
        """Find the k most similar paths.

        If the phrase is not in the database, return None.

        Args:
            phrase: str
            sim: Similarity
            k: int

        Returns:
            a list of tuples of size k.
            Each tuple contains the path and the corresponding score.
        """
        if phrase not in self._database:
            return None

        reval = [(path, sim.PathSim(phrase, path)) for path in self._database]
        reval.sort(key=lambda x: x[-1], reverse=True)
        # To deal with tie cases.
        value = reval[k - 1][-1]
        reval = [v for v in reval if v[-1] >= value]
        return reval

    def _write_result(self, f, phrase, result):
        """Write thr result into files.

        Args:
            f: file
            phrase: str
            result: list(tuple(path, score))
        """
        s = 'MOST SIMILAR RULES FOR: {a}\n'.format(a=phrase)
        n = 'This phrase is not in the triple database.\n'
        t = '{a}. {b}\t{c}\n'
        f.write(s)
        if result is None:
            f.write(n)
        else:
            for i, item in enumerate(result):
                path = str(item[0])
                score = str(item[-1])
                tt = t.format(a=str(i + 1), b=path, c=score)
                f.write(tt)
        f.write('\n')
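# A minimal usage sketch (not part of the original example); the corpus and
# test-phrase paths are placeholders. Results are written to trace.txt.
dirt = DIRT()
dirt.run('corpus.txt', 'test_phrases.txt', minfreq=3)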
Example #14
# Initialize Corpuses
corpus = {}
for year in years:
    corpus[year] = Corpus(year=year - 2015)

# Build Corpuses: Load cleaned articles, build phrasers, dictionary, and BOWs
for year in years:
    print("Corpus " + str(year) + ":")
    corpus[year].build_corpus()
    print("Corpus " + str(year) + " Done\n")

# Extract keywords from each article using tf-idf
for year in years:
    print("Corpus " + str(year) + ":")
    corpus[year].build_tfidf()
    corpus[year].extractor = Extractor(corpus[year])
    corpus[year].extractor.extract(k=num_keywords)
    print("Corpus " + str(year) + " Done\n")

# Build LDA model, cluster articles into issues
for year in years:
    print("Corpus " + str(year) + ":")
    corpus[year].build_lda(num_topics=num_issues)
    corpus[year].issue_model = IssueModel(corpus=corpus[year],
                                          model=corpus[year].lda)
    corpus[year].issue_model.build_issues()
    print("Corpus " + str(year) + " Done\n")

# Init Issues (for Issue Tracking)
issues = []
for year in years:
Example #15
    def crawl(self, download=True):
        logger.info('')
        logger.info('Start crawling %s (%s)' % (self.name, self.url))

        # Custom definitions
        metapath = eval(self.meta_xpath)
        rules = [item.strip() for item in self.refine_rules.split('\n')
                 if item.strip()]
        extrapath = [item.strip() for item in self.extra_xpath.split('\n')
                     if item.strip()]
        proxy = {}
        if self.proxy:
            logger.info('Use proxy server: %s' % self.proxy.address)
            proxy = self.proxy.get_dict()
        else:
            proxy = None
        if self.user_agent:
            logger.info('Use user agent: %s' % self.user_agent.name)
            ua = self.user_agent.value
        else:
            ua = None
        extractor = Extractor(self.url, settings.CRAWL_ROOT, 
                              proxies=proxy, user_agent=ua)
        all_links = extractor.extract_links(
            xpath=self.link_xpath,
            expand_rules=self.expand_rules.split('\n'),
            depth=self.crawl_depth)
        logger.info('%d link(s) found' % len(all_links))

        if download:
            blacklist = []
            if self.black_words:
                blacklist = self.black_words.words.split('\n')
            for link in all_links:
                try:
                    link_url = link['url']
                    if LocalContent.objects.filter(url=link_url).count():
                        logger.info('Bypass %s' % link_url)
                        continue
                    logger.info('Download %s' % link_url)
                    location = datetime.now().strftime('%Y/%m/%d')
                    location = os.path.join(settings.CRAWL_ROOT, location)
                    sub_extr = Extractor(link_url, location, proxy)
                    if self.content_type:
                        base_meta = {'type': self.content_type.name}
                    else:
                        base_meta = None
                    local_path = sub_extr.extract_content(
                        self.content_xpath,
                        with_image=self.download_image,
                        metapath=metapath,
                        extrapath=extrapath,
                        custom_rules=rules,
                        blacklist=blacklist,
                        metadata=base_meta)
                    content = LocalContent(url=link_url, source=self,
                                           local_path=local_path)
                    content.save()
                except:
                    logger.exception('Error when extracting %s' % link['url'])
        else:
            return all_links
Example #16
    def humanListCallback(self, humanDepthList):
        if self.__isImuValid is False or self.__isAngleValid is False:
            return
        if self.__controlState == 0:
            # A control state of 0 means recording has not started yet.
            if self.__controlSignal is False:
                return
            else:
                print "start recording!"
                self.__controlState = 1
        if self.__controlState == 1:
            if self.__controlSignal is True:
                # Record the data.
                curData = self.__extract.extractHumanPose(
                    humanDepthList, self.__odm)
                curData.extend([
                    self.__angleData.x, self.__angleData.y, self.__angleData.z,
                    self.__imuData.linear_acceleration.x,
                    self.__imuData.linear_acceleration.y,
                    self.__imuData.linear_acceleration.z,
                    self.__imuData.angular_velocity.x,
                    self.__imuData.angular_velocity.y,
                    self.__imuData.angular_velocity.z
                ])
                self.__data.append(curData)
                nowLocation = Point()
                nowLocation.x = curData[0]
                nowLocation.y = curData[1]
                nowLocation.z = 0
                self.__pubLocation.publish(nowLocation)
                if self.__isAngleInit is False:
                    self.__initAngleData.append(self.__angleData.z)
                    if len(self.__initAngleData) >= 8:
                        self.__isAngleInit = True
                        self.__angleOffset = sum(self.__initAngleData) / len(
                            self.__initAngleData)
                        self.__initAngleData = []
                if len(self.__data) >= 11:
                    self.__isPredictionValid = True
                if len(self.__data) > 11:
                    del self.__data[0]
                if self.__isPredictionValid is True:
                    # print self.__odm
                    npTrajectory = np.array(self.__data)
                    cur = time.time()
                    npFuture = self.__model.predictFuture(
                        npTrajectory[:, srcIndex])
                    Weights = np.matmul(T_inv, npFuture)
                    # plotLocationData(npTrajectory.transpose(), npFuture.transpose())
                    self.__collectData.append([
                        npTrajectory[:, srcIndex].transpose(),
                        npFuture.transpose()
                    ])
                    futureTrajectory = FutureTrajectory()
                    for i in range(0, 8):
                        point = Point()
                        point.x = npFuture[i][0]
                        point.y = npFuture[i][1]
                        point.z = 0
                        futureTrajectory.locations.append(point)
                    for i in range(0, 3):
                        for j in range(0, 2):
                            futureTrajectory.weights.append(
                                Float64(Weights[i][j] / 1000))
                    futureTrajectory.weights.append(
                        Float64(self.__angleData.z - self.__angleOffset))
                    print((self.__angleData.z - self.__angleOffset) / 180 *
                          3.14159)
                    now = time.time()
                    # print( now - cur )
                    self.__pubFuture.publish(futureTrajectory)

            else:
                self.__controlState = 0
                # Save the data to a file and clear the buffer.
                nowTime = time.strftime("%Y-%m-%d-%H_%M_%S",
                                        time.localtime(time.time()))
                fileName = nowTime + ' Line3' + '.npy'
                # plotData(self.__data)
                np.save(fileName, self.__data)
                self.__data = []
                self.__extract = Extractor()
                print "save as " + fileName
                self.__isAngleInit = False
                '''
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""

    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>imdb title'
    _titleBuilder = lambda self, x: build_title(x, canonical=True)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()"
                  },
                  postprocess=lambda x:
                  (analyze_imdbid(x.get('link') or u''),
                   analyze_title(x.get('info') or u'', canonical=1)))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
                  attrs=_attrs)
    ]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):
        if self._notDirectHitTitle in html_string[:1024].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                html_string = html_string.replace('(TV mini-series)', '(mini)')
                html_string = _reAKAS.sub('</td>', html_string)
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res: return u''
        res = res['data']
        if not (res and res[0]): return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        #    # Tries to cope with companies for which links to pro.imdb.com
        #    # are missing.
        #    link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title): return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
                                                                       title)
        return new_html

    def postprocess_data(self, data):
        if not data.has_key('data'):
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        return data

    def add_refs(self, data):
        return data
Train = True

if version == 1:
    depth = n * 6 + 2
elif version == 2:
    depth = n * 9 + 2

# Model name, depth and version
model_type = 'ResNet_%s' % (model_key)

train_txt_fp = config.get("FILE_PATH", 'train_imagepath_label')
train_pt = './train/train/'
class_wordembedings_txt_fp = config.get("FILE_PATH",
                                        'class_wordembeddings_reduced_100')
label_list_fp = config.get('FILE_PATH', 'lable_list')
x_train, y_train, x_test, y_test = Extractor.readTrainDataVersion2(
    train_txt_fp, train_pt, class_wordembedings_txt_fp, label_list_fp)
print(y_train.shape)

input_shape = x_train.shape[1:]
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255
if subtract_pixel_mean:
    x_train_mean = np.mean(x_train, axis=0)
    x_train -= x_train_mean
    x_test -= x_train_mean

print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)

save_dir = os.path.join(os.getcwd(), 'saved_models')
model_name = 'cifar10_%s_model.{epoch:03d}.{val_acc:03f}.h5' % model_type
class DOMHTMLBioParser(DOMParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bioparser = DOMHTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    _defGetRefs = True

    _birth_attrs = [Attribute(key='birth date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/search/name?birth_monthday=')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/search/name?birth_year=')]/text()"
                            },
                        postprocess=build_date),
                    Attribute(key='birth notes',
                        path="./a[starts-with(@href, " \
                                "'/search/name?birth_place=')]/text()")]
    _death_attrs = [Attribute(key='death date',
                        path={
                            'day': "./a[starts-with(@href, " \
                                    "'/search/name?death_monthday=')]/text()",
                            'year': "./a[starts-with(@href, " \
                                    "'/search/name?death_date=')]/text()"
                            },
                        postprocess=build_date),
                    Attribute(key='death notes',
                        path="./text()",
                        # TODO: check if this slicing is always correct
                        postprocess=lambda x: u''.join(x).strip()[2:])]
    extractors = [
            Extractor(label='headshot',
                        path="//a[@name='headshot']",
                        attrs=Attribute(key='headshot',
                            path="./img/@src")),
            Extractor(label='birth info',
                        path="//table[@id='overviewTable']//td[text()='Date of Birth']/following-sibling::td[1]",
                        attrs=_birth_attrs),
            Extractor(label='death info',
                        path="//table[@id='overviewTable']//td[text()='Date of Death']/following-sibling::td[1]",
                        attrs=_death_attrs),
            Extractor(label='nick names',
                        path="//table[@id='overviewTable']//td[text()='Nickenames']/following-sibling::td[1]",
                        attrs=Attribute(key='nick names',
                            path="./text()",
                            joiner='|',
                            postprocess=lambda x: [n.strip().replace(' (',
                                    '::(', 1) for n in x.split('|')
                                    if n.strip()])),
            Extractor(label='birth name',
                        path="//table[@id='overviewTable']//td[text()='Birth Name']/following-sibling::td[1]",
                        attrs=Attribute(key='birth name',
                            path="./text()",
                            postprocess=lambda x: canonicalName(x.strip()))),
            Extractor(label='height',
                path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
                        attrs=Attribute(key='height',
                            path="./text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='mini biography',
                        path="//a[@name='mini_bio']/following-sibling::div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
                        attrs=Attribute(key='mini biography',
                            multi=True,
                            path={
                                'bio': ".//text()",
                                'by': ".//a[@name='ba']//text()"
                                },
                            postprocess=lambda x: "%s::%s" % \
                                ((x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
                                (x.get('by') or u'').strip() or u'Anonymous'))),
            Extractor(label='spouse',
                        path="//div[h5='Spouse']/table/tr",
                        attrs=Attribute(key='spouse',
                            multi=True,
                            path={
                                'name': "./td[1]//text()",
                                'info': "./td[2]//text()"
                                },
                            postprocess=lambda x: ("%s::%s" % \
                                (x.get('name').strip(),
                                (x.get('info') or u'').strip())).strip(':'))),
            Extractor(label='trade mark',
                        path="//div[h5='Trade Mark']/p",
                        attrs=Attribute(key='trade mark',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='trivia',
                        path="//div[h5='Trivia']/p",
                        attrs=Attribute(key='trivia',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='quotes',
                        path="//div[h5='Personal Quotes']/p",
                        attrs=Attribute(key='quotes',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            Extractor(label='salary',
                        path="//div[h5='Salary']/table/tr",
                        attrs=Attribute(key='salary history',
                            multi=True,
                            path={
                                'title': "./td[1]//text()",
                                'info': "./td[2]/text()",
                                },
                            postprocess=lambda x: "%s::%s" % \
                                    (x.get('title').strip(),
                                        x.get('info').strip()))),
            Extractor(label='where now',
                        path="//div[h5='Where Are They Now']/p",
                        attrs=Attribute(key='where now',
                            multi=True,
                            path=".//text()",
                            postprocess=lambda x: x.strip())),
            ]

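    # The first preprocessor below closes the previous section and opens a
    # fresh <div class="_imdbpy"> before every <h5> header, so the
    # //div[h5='...'] paths used by the extractors each match one section.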
    preprocessors = [(re.compile('(<h5>)',
                                 re.I), r'</div><div class="_imdbpy">\1'),
                     (re.compile('(</table>\n</div>\s+)</div>',
                                 re.I + re.DOTALL), r'\1'),
                     (re.compile('(<div id="tn15bot">)'), r'</div>\1'),
                     (re.compile('\.<br><br>([^\s])', re.I), r'. \1')]

    def postprocess_data(self, data):
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        return data
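# Usage sketch (illustration only, not part of the original source): parsing a
# hypothetical saved copy of a biography page.  The section keys produced by
# the extractors above ('birth date', 'mini biography', 'trivia', ...) end up
# in the returned dictionary, typically nested under a 'data' key.
bioparser = DOMHTMLBioParser()
bio_info = bioparser.parse(open('biography.html').read())
print sorted(bio_info.get('data', bio_info).keys())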
Exemple #20
0
"""
Author: Sulley
Date: 2020.2.29
"""

import chardet
import codecs
import os
import sys
import csv
import xlrd
import docx
import jieba
import itertools, string
from pypinyin import pinyin, lazy_pinyin, Style
from PyQt5.Qt import *
from utils import Converter, Counter, Extractor, Corpus, Lexicon
from window import Window, EmittingStream

if __name__ == '__main__':
    converter = Converter()
    counter = Counter(converter)
    extractor = Extractor(converter)
    corpus = Corpus()
    lexicon = Lexicon()

    app = QApplication(sys.argv)
    exe = Window(converter, counter, extractor, corpus, lexicon)
    sys.exit(app.exec_())
class DOMHTMLMaindetailsParser(DOMParserBase):
    """Parser for the "categorized" (maindetails) page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMHTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    _containsObjects = True
    _name_imdb_index = re.compile(r'\([IVXLCDM]+\)')

    _birth_attrs = [Attribute(key='birth date',
                        path='.//time[@itemprop="birthDate"]/@datetime'),
                    Attribute(key='birth place',
                        path=".//a[starts-with(@href, " \
                                "'/search/name?birth_place=')]/text()")]
    _death_attrs = [Attribute(key='death date',
                        path='.//time[@itemprop="deathDate"]/@datetime'),
                    Attribute(key='death place',
                        path=".//a[starts-with(@href, " \
                                "'/search/name?death_place=')]/text()")]
    _film_attrs = [
        Attribute(key=None,
                  multi=True,
                  path={
                      'link': "./b/a[1]/@href",
                      'title': "./b/a[1]/text()",
                      'notes': "./b/following-sibling::text()",
                      'year': "./span[@class='year_column']/text()",
                      'status': "./a[@class='in_production']/text()",
                      'rolesNoChar': './/br/following-sibling::text()',
                      'chrRoles': "./a[@imdbpyname]/@imdbpyname",
                      'roleID': "./a[starts-with(@href, '/character/')]/@href"
                  },
                  postprocess=lambda x: build_movie(
                      x.get('title') or u'',
                      year=x.get('year'),
                      movieID=analyze_imdbid(x.get('link') or u''),
                      rolesNoChar=(x.get('rolesNoChar') or u'').strip(),
                      chrRoles=(x.get('chrRoles') or u'').strip(),
                      additionalNotes=x.get('notes'),
                      roleID=(x.get('roleID') or u''),
                      status=x.get('status') or None))
    ]

    extractors = [
            Extractor(label='name',
                        path="//h1[@class='header']",
                        attrs=Attribute(key='name',
                            path=".//text()",
                            postprocess=lambda x: analyze_name(x,
                                                               canonical=1))),
            Extractor(label='name_index',
                        path="//h1[@class='header']/span[1]",
                        attrs=Attribute(key='name_index',
                            path="./text()")),

            Extractor(label='birth info',
                        path="//div[h4='Born:']",
                        attrs=_birth_attrs),

            Extractor(label='death info',
                        path="//div[h4='Died:']",
                        attrs=_death_attrs),

            Extractor(label='headshot',
                        path="//td[@id='img_primary']/div[@class='image']/a",
                        attrs=Attribute(key='headshot',
                            path="./img/@src")),

            Extractor(label='akas',
                        path="//div[h4='Alternate Names:']",
                        attrs=Attribute(key='akas',
                            path="./text()",
                            postprocess=lambda x: x.strip().split('  '))),

            Extractor(label='filmography',
                        group="//div[starts-with(@id, 'filmo-head-')]",
                        group_key="./a[@name]/text()",
                        group_key_normalize=lambda x: x.lower().replace(': ', ' '),
                        path="./following-sibling::div[1]" \
                                "/div[starts-with(@class, 'filmo-row')]",
                        attrs=_film_attrs),

            Extractor(label='indevelopment',
                        path="//div[starts-with(@class,'devitem')]",
                        attrs=Attribute(key='in development',
                            multi=True,
                            path={
                                'link': './a/@href',
                                'title': './a/text()'
                                },
                                postprocess=lambda x:
                                    build_movie(x.get('title') or u'',
                                        movieID=analyze_imdbid(x.get('link') or u''),
                                        roleID=(x.get('roleID') or u'').split('/'),
                                        status=x.get('status') or None)))
            ]

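    # The regex preprocessor below copies each /character/chXXXXXXX link text
    # into an imdbpyname attribute, which is what the 'chrRoles' path in
    # _film_attrs ("./a[@imdbpyname]/@imdbpyname") reads back.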
    preprocessors = [
        ('<div class="clear"/> </div>', ''), ('<br/>', '<br />'),
        (re.compile(r'(<a href="/character/ch[0-9]{7}")>(.*?)</a>'),
         r'\1 imdbpyname="\2@@">\2</a>')
    ]

    def postprocess_data(self, data):
        for what in 'birth date', 'death date':
            if what in data and not data[what]:
                del data[what]
        name_index = (data.get('name_index') or '').strip()
        if name_index:
            if self._name_imdb_index.match(name_index):
                data['imdbIndex'] = name_index[1:-1]
            del data['name_index']
        # XXX: the code below is for backwards compatibility
        # probably could be removed
        for key in data.keys():
            if key.startswith('actor '):
                if not data.has_key('actor'):
                    data['actor'] = []
                data['actor'].extend(data[key])
                del data[key]
            if key.startswith('actress '):
                if not data.has_key('actress'):
                    data['actress'] = []
                data['actress'].extend(data[key])
                del data[key]
            if key.startswith('self '):
                if not data.has_key('self'):
                    data['self'] = []
                data['self'].extend(data[key])
                del data[key]
            if key == 'birth place':
                data['birth notes'] = data[key]
                del data[key]
            if key == 'death place':
                data['death notes'] = data[key]
                del data[key]
        return data
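# Usage sketch (illustration only, not part of the original source): parsing a
# hypothetical saved copy of a person's maindetails page.  Filmography entries
# are grouped under the lower-cased category names produced above (e.g.
# 'actor', 'actress', 'director') alongside 'name', 'birth date', etc.,
# typically nested under a 'data' key in the returned dictionary.
cparser = DOMHTMLMaindetailsParser()
result = cparser.parse(open('maindetails.html').read())
person = result.get('data', result)
print person.get('name')
for movie in person.get('actor', [])[:5]:
    print movie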
model_save_fp = config.get("MODEL", 'model_save_fp')
train_label_20_fp = config.get('FILE_PATH', 'train_label_20')
train_lable_fp = config.get('MODEL', 'train_lable')
train_imageName_Lable_fp = config.get("MODEL", 'train_imageName_Lable_fp')
image_path = config.get("MODEL", 'image_path')

if version == 1:
    depth = n * 6 + 2
elif version == 2:
    depth = n * 9 + 2
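# For reference: the customary n = 3 gives depth 20 for version 1 (6n + 2)
# and depth 29 for version 2 (9n + 2), i.e. the standard ResNet20/ResNet29
# CIFAR-10 configurations.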


model_type = 'ResNet_%s' % (model_key)
save_dir = os.path.join(config.get('MODEL', 'data_pre_pt'), 'saved_models')

x_train, y_train, x_test, y_test, train_cate_num = Extractor.gainTrainAndTest(
    train_lable_fp, train_imageName_Lable_fp, image_path)
num_classes = train_cate_num
input_shape = x_train.shape[1:]
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

if subtract_pixel_mean:
    x_train_mean = np.mean(x_train, axis=0)
    x_train -= x_train_mean
    x_test -= x_train_mean
print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)

model_name = 'cifar10_%s_model.{epoch:03d}.{val_acc:03f}.h5' % model_type
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
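# A minimal sketch (not part of the original snippet) of how save_dir and
# model_name are usually wired into training: a ModelCheckpoint callback fills
# in {epoch} and {val_acc} from the template above.  It assumes a compiled
# Keras model named `model` plus `epochs` and `batch_size` settings, none of
# which are defined in this fragment.
from keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint(filepath=os.path.join(save_dir, model_name),
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True)
# model.fit(x_train, y_train, validation_data=(x_test, y_test),
#           epochs=epochs, batch_size=batch_size, callbacks=[checkpoint])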
class DOMHTMLSearchMovieParser(DOMParserBase):
    """Parse the html page that the IMDb web server shows when the
    "new search system" is used, for movies."""

    _BaseParser = DOMBasicMovieParser
    _notDirectHitTitle = '<title>find - imdb</title>'
    _titleBuilder = lambda self, x: build_title(x)
    _linkPrefix = '/title/tt'

    _attrs = [
        Attribute(key='data',
                  multi=True,
                  path={
                      'link': "./a[1]/@href",
                      'info': ".//text()",
                      'akas': "./i//text()"
                  },
                  postprocess=lambda x:
                  (analyze_imdbid(x.get('link') or u''),
                   custom_analyze_title(x.get('info') or u''), x.get('akas')))
    ]
    extractors = [
        Extractor(label='search',
                  path="//td[@class='result_text']",
                  attrs=_attrs)
    ]

    def _init(self):
        self.url = u''

    def _reset(self):
        self.url = u''

    def preprocess_string(self, html_string):

        if self._notDirectHitTitle in html_string[:10240].lower():
            if self._linkPrefix == '/title/tt':
                # Only for movies.
                # XXX (HTU): does this still apply?
                html_string = html_string.replace('(TV mini-series)', '(mini)')
            return html_string
        # Direct hit!
        dbme = self._BaseParser(useModule=self._useModule)
        res = dbme.parse(html_string, url=self.url)
        if not res: return u''
        res = res['data']
        if not (res and res[0]): return u''
        link = '%s%s' % (self._linkPrefix, res[0][0])
        #    # Tries to cope with companies for which links to pro.imdb.com
        #    # are missing.
        #    link = self.url.replace(imdbURL_base[:-1], '')
        title = self._titleBuilder(res[0][1])
        if not (link and title): return u''
        link = link.replace('http://pro.imdb.com', '')
        new_html = '<td class="result_text"><a href="%s">%s</a></td>' % (link,
                                                                         title)
        return new_html

    def postprocess_data(self, data):
        if not data.has_key('data'):
            data['data'] = []
        results = getattr(self, 'results', None)
        if results is not None:
            data['data'][:] = data['data'][:results]
        # Horrible hack to support AKAs.
        if data and data['data'] and len(data['data'][0]) == 3 and \
                isinstance(data['data'][0], tuple):
            data['data'] = [x for x in data['data'] if x[0] and x[1]]
            for idx, datum in enumerate(data['data']):
                if not isinstance(datum, tuple):
                    continue
                if not datum[0] and datum[1]:
                    continue
                if datum[2] is not None:
                    #akas = filter(None, datum[2].split('::'))
                    if self._linkPrefix == '/title/tt':
                        # XXX (HTU): couldn't find a result with multiple akas
                        aka = datum[2]
                        akas = [aka[1:-1]]  # remove the quotes
                        #akas = [a.replace('" - ', '::').rstrip() for a in akas]
                        #akas = [a.replace('aka "', '', 1).replace('aka  "',
                        #'', 1).lstrip() for a in akas]
                    datum[1]['akas'] = akas
                    data['data'][idx] = (datum[0], datum[1])
                else:
                    data['data'][idx] = (datum[0], datum[1])
        return data

    def add_refs(self, data):
        return data
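# Usage sketch (illustration only, not part of the original source): parsing a
# hypothetical saved copy of an IMDb "find" results page.  Setting the
# optional `results` attribute (read via getattr() in postprocess_data above)
# caps the number of returned matches; each entry is a (movieID, info) tuple,
# where info is the dictionary produced by custom_analyze_title.
sparser = DOMHTMLSearchMovieParser()
sparser.results = 20
found = sparser.parse(open('find_results.html').read())
for movieID, info in found['data']:
    print movieID, info.get('title')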
model = load_model(mode_fp)
model.summary()

if config.getboolean('EXTRACT_IMAGE_FEATURE', 'extract_all_train'):
    print('getting all train fc vector...')
    model_key = config.get('EXTRACT_IMAGE_FEATURE', 'model_key')
    model_path = config.get("EXTRACT_IMAGE_FEATURE", 'model_path')
    if not os.path.exists(model_path):
        os.mkdir(model_path)
    save_fp = model_path + config.get('EXTRACT_IMAGE_FEATURE',
                                      'fc_vector_alltrain')
    train_lable_fp = config.get('EXTRACT_IMAGE_FEATURE', 'train_lable_fp')
    train_imageName_Lable_fp = config.get("EXTRACT_IMAGE_FEATURE",
                                          'train_imageName_Lable_fp')
    image_path_train = config.get("EXTRACT_IMAGE_FEATURE", 'image_path_train')
    X, Y = Extractor.gainVal(train_lable_fp, train_imageName_Lable_fp,
                             image_path_train)
    print("total x:", X.shape[0])
    X = X.astype('float32') / 255
    x_train, y_train, x_test, y_test, train_cate_num = Extractor.gainTrainAndTest(
        train_lable_fp, train_imageName_Lable_fp, image_path_train)
    x_train = x_train.astype('float32') / 255
    x_train_mean = np.mean(x_train, axis=0)
    X -= x_train_mean
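    # The loaded model is assumed to have two outputs: the softmax
    # predictions (pre_y) and the penultimate fully-connected features
    # (pre_fc) that MyFunction.saveFcLayer writes out below.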
    pre_y, pre_fc = model.predict(X, verbose=1)
    #print(MyFunction.computeAcc(Y, pre_y))
    MyFunction.saveFcLayer(pre_y, Y, pre_fc, save_fp, train_lable_fp)

if config.getboolean('EXTRACT_IMAGE_FEATURE', 'extract_val'):
    print("getting val fc vector...")
    model_key = config.get('MODEL', 'model_key')
    train_lable = config.get('MODEL', 'train_lable')