Beispiel #1
0
def parse_legislator_row(chamber, session, row):
    """Parse one legislator table row into name/district/party fields.

    Returns None for rows whose first cell is empty or not a member
    link (header/spacer rows).
    """
    cells = row("td")
    party = get_text(cells[-1])
    district = get_text(cells[-2])
    name_cell = cells[0].contents
    if not name_cell:
        return None
    linked_name = name_cell[0]
    first_name = middle_name = last_name = full_name = suffix = ""
    try:
        link = linked_name['href']
        match = MEMBER_ID_PATTERN.match(link)
        member_id = match.groups()[0]
        url = urljoin(MEMBER_LIST_URL[chamber], link)
        full_name = " ".join(linked_name.contents)  # contents is a list
        # maxsplit=1 keeps names with more than one comma (e.g.
        # "Smith, Jr., III") from raising ValueError on unpack.
        if "," in full_name:
            (name, suffix) = full_name.split(",", 1)
        else:
            name = full_name
            suffix = ""

        name_parts = name.split()
        if len(name_parts) == 2:
            (first_name, last_name) = name_parts
        elif len(name_parts) > 3:
            (first_name, middle_name) = name_parts[:2]
            last_name = " ".join(name_parts[2:])
        elif len(name_parts) == 3:
            first_name, middle_name, last_name = name_parts
        else:
            raise ValueError("Unexpected number of parts to %s" % full_name)
    except KeyError:
        # Py3-compatible form of `except KeyError, e` (e was unused);
        # linked_name has no 'href' -> not a member link.
        return None
Beispiel #2
0
def parse_legislator_row(chamber, session, row):
    """Parse one legislator table row into name/district/party fields.

    Returns None for rows whose first cell is empty or not a member
    link (header/spacer rows).
    """
    cells = row("td")
    party = get_text(cells[-1])
    district = get_text(cells[-2])
    name_cell = cells[0].contents
    if not name_cell:
        return None
    linked_name = name_cell[0]
    first_name = middle_name = last_name = full_name = suffix = ""
    try:
        link = linked_name['href']
        match = MEMBER_ID_PATTERN.match(link)
        member_id = match.groups()[0]
        url = urljoin(MEMBER_LIST_URL[chamber], link)
        full_name = " ".join(linked_name.contents)  # contents is a list
        # maxsplit=1 keeps names with more than one comma from raising
        # ValueError on unpack.
        if "," in full_name:
            (name, suffix) = full_name.split(",", 1)
        else:
            name = full_name
            suffix = ""

        name_parts = name.split()
        if len(name_parts) == 2:
            (first_name, last_name) = name_parts
        elif len(name_parts) > 3:
            (first_name, middle_name) = name_parts[:2]
            last_name = " ".join(name_parts[2:])
        elif len(name_parts) == 3:
            first_name, middle_name, last_name = name_parts
        else:
            raise ValueError("Unexpected number of parts to %s" % full_name)
    except KeyError:
        # Py3-compatible form of `except KeyError, e` (e was unused).
        return None
Beispiel #3
0
    def testEntry(self):
        """Round-trip set_text/get_text for each text construct type."""
        etree = fromstring(ENTRY2)
        cases = (
            ('content', 'html'),
            ('title', 'xhtml'),
            ('summary', 'text'),
        )
        for element, text_type in cases:
            util.set_text(etree, element, text_type, '<p>hello</p>')
            self.assertEqual((text_type, "<p>hello</p>"),
                             util.get_text(element, etree))
Beispiel #4
0
    def parse_item(self, response):
        """Scrape app metadata from a wandoujia detail page and append
        one '\\001'-delimited record to self.fileout."""
        source = 'wandoujia'

        name = util.get_text(response, '//p[@class="app-name"]/span/text()')
        if not name:
            return

        version = util.get_text(
            response, '//dl[@class="infos-list"]/dd[4]/text()')

        # Category = parent category carried in meta + crumb on the page.
        first = response.meta['first']
        second = util.get_text(
            response, '//div[@class="crumb"]/div[2]/a/span/text()')
        category = first + '-' + second

        time = util.get_text(response, '//time[@id="baidu_time"]/text()')
        size = util.get_text(
            response, '//dl[@class="infos-list"]/dd[1]/text()')
        system = util.get_text(
            response, '//dl[@class="infos-list"]/dd[5]/text()')
        text = util.get_text(response, '//div[@itemprop="description"]', 0)
        download = util.get_text(
            response, '//i[@itemprop="interactionCount"]/@content')

        pingfen = ''  # no rating available on this source

        tag_texts = response.xpath('//dd[@class="tag-box"]//a/text()').extract()
        tags = ','.join(t.strip() for t in tag_texts)

        record = '\001'.join([
            source, name, version, category, util.unify_data(time), size,
            system, text, util.unify_download_count(download), pingfen, tags,
        ])
        self.fileout.write(record)
        self.fileout.write('\n')
Beispiel #5
0
    def parse(self, response):
        """Yield a detail-page request per app on the list page, carrying
        time/system/download along in meta, then follow the pager."""
        applist = response.xpath("//dl[@class='down_list pd20']")
        for app in applist:
            app_url = util.get_text(app, "./dd[@class='down_title']/h2/a/@href")
            time = util.get_text(app, "./dd[@class='down_attribute align_l']/span[3]/text()")
            system = util.get_text(app, "./dd[@class='down_attribute align_l']/span[5]/text()")
            download = util.get_text(app, "./dd[@class='down_attribute align_l']/span[7]/text()")
            yield scrapy.Request(
                app_url, callback=self.parse_item, meta={"time": time, "system": system, "download": download}
            )

        # 下一页 = "next page"; renamed to avoid shadowing builtin next().
        next_page = response.xpath('//a[text()="下一页"]/@href').extract()
        if next_page:
            yield scrapy.Request(next_page[0], callback=self.parse)
Beispiel #6
0
    def parse_item(self, response):
        """Scrape app metadata from an anzhi detail page and append one
        '\\001'-delimited record to self.fileout."""
        source = 'anzhi'

        name = util.get_text(response, '//div[@class="detail_line"]/h3//text()')
        if not name:
            return

        # [1:-1] strips the surrounding parentheses around the version.
        version = util.get_text(response, '//div[@class="detail_line"]/span//text()')[1:-1]

        first = response.meta['cate']

        # The detail list has 7 items when a download count is present,
        # 6 when it is not.  Default every field first so an unexpected
        # layout cannot raise NameError below (the original left these
        # unbound when len(data) was neither 6 nor 7).
        second = download = time = size = system = ''
        data = response.xpath('//ul[@id="detail_line_ul"]/li//text()').extract()
        if len(data) == 7:
            second = data[0][3:]
            download = data[1][3:]
            time = data[2][3:]
            size = data[3][3:]
            system = data[4][3:]
        elif len(data) == 6:
            second = data[0][3:]
            download = ''
            time = data[1][3:]
            size = data[2][3:]
            system = data[3][3:]

        category = first + '-' + second

        text = util.get_text(response, '//div[@class="app_detail_infor"]', 0)

        # Rating is encoded in an inline CSS style; extract the pixel
        # offset and rescale (px/15*10 — presumably to a 0-10 scale).
        pingfen = util.get_text(response, '//div[@id="stars_detail"]/@style')
        p = pingfen.split('-')
        if len(p) == 2:
            pingfen = '0.0'
        elif len(p) == 3:
            pingfen = p[2][:-3]
        try:
            pingfen = str(float(pingfen) / 15 * 10)
        except Exception:
            pingfen = ''

        tags = ''

        self.fileout.write(
            source + '\001' + name + '\001' + version + '\001' + category + '\001' + util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text + '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags
        )
        self.fileout.write('\n')
Beispiel #7
0
    def parse_item(self, response):
        """Scrape app metadata from a hiapk detail page and append one
        '\\001'-delimited record to self.fileout."""
        source = 'hiapk'

        # Page title has the form "Name(version)"; fall back to the raw
        # title (empty version) when the parentheses are absent.
        name_and_version = util.get_text(response,
                                         "//div[@id='appSoftName']/text()")
        try:
            version = name_and_version.split('(')[1].split(')')[0]
            name = name_and_version.split('(')[0]
        except Exception:
            version = ''
            name = name_and_version
        if not name:
            return

        first = util.get_text(response, "//a[@id='categoryParent']/text()")
        second = util.get_text(response, "//a[@id='categoryLink']/text()")
        category = first + '-' + second

        time = util.get_text(
            response,
            '//div[@class="code_box_border"]/div[@class="line_content"][7]/span[2]/text()'
        )

        size = util.get_text(response, '//span[@id="appSize"]/text()')

        system = util.get_text(
            response,
            '//span[@class="font14 detailMiniSdk d_gj_line left"]/text()')

        text = util.get_text(response, '//pre[@id="softIntroduce"]', 0)

        download = util.get_text(
            response,
            '//div[@class="code_box_border"]/div[@class="line_content"][2]/span[2]/text()'
        )

        # Rating is embedded in a CSS class name; the third underscore
        # part is numeric (doubled — presumably to a 10-point scale).
        pingfen = util.get_text(response,
                                '//div[@id="appIconTips"]/div[1]/@class')
        try:
            pingfen = str(float(pingfen.split(" ")[2].split("_")[2]) * 2)
        except Exception:
            pingfen = ''

        tags = ''

        self.fileout.write(source + '\001' + name + '\001' + version + '\001' +
                           category + '\001' + util.unify_data(time) + '\001' +
                           size + '\001' + system + '\001' + text + '\001' +
                           util.unify_download_count(download) + '\001' +
                           pingfen + '\001' + tags)
        self.fileout.write('\n')
Beispiel #8
0
    def parse_item(self, response):
        """Scrape app metadata from an Apple App Store detail page and
        append one '\\001'-delimited record to self.fileout."""
        source = 'applestore'

        name = util.get_text(
            response,
            '//div[@id="desktopContentBlockId"]//div[@id="title"]//h1/text()')
        if not name:
            return

        version = util.get_text(
            response,
            '//div[@id="left-stack"]//span[@itemprop="softwareVersion"]/text()'
        )

        # Top-level category is fixed ("software"); only the sub-category
        # comes from the page.
        first = '软件'
        second = util.get_text(
            response,
            '//div[@id="left-stack"]//span[@itemprop="applicationCategory"]/text()'
        )
        category = first + '-' + second

        time = util.get_text(
            response,
            '//div[@id="left-stack"]//span[@itemprop="datePublished"]/text()')

        size = ''  # not exposed on this page

        system = util.get_text(
            response,
            '//div[@id="left-stack"]//span[@itemprop="operatingSystem"]/text()'
        )

        text = util.get_text(
            response,
            '//div[@class="center-stack"]/div[@class="product-review"]/p', 0)

        download = util.get_text(
            response,
            '//div[@class="extra-list customer-ratings"]/div[4]/span/text()')

        # aria-label starts with the star count, e.g. "4.5星, ...";
        # scaled by 20 (presumably to a 0-100 score).
        pingfen = util.get_text(
            response,
            '//div[@class="extra-list customer-ratings"]/div[4]/@aria-label')
        try:
            pingfen = str(float(pingfen.split('星, ')[0]) * 20)
        except Exception:
            pingfen = ''

        tags = ''

        self.fileout.write(source + '\001' + name + '\001' + version + '\001' +
                           category + '\001' + util.unify_data(time) + '\001' +
                           size + '\001' + system + '\001' + text + '\001' +
                           util.unify_download_count(download) + '\001' +
                           pingfen + '\001' + tags)
        self.fileout.write('\n')
Beispiel #9
0
    def parse(self, response):
        """Yield a detail-page request per app link, then follow the
        "next page" link until it becomes the '#next' placeholder."""
        applist = response.xpath('//p[@class="f-s3 t-overflow"]/a/@href').extract()
        for app in applist:
            yield scrapy.Request('http://m.163.com' + app, callback=self.parse_item)

        # 下一页 = "next page"; renamed to avoid shadowing builtin next().
        next_href = util.get_text(response, '//a[text()="下一页"]/@href')
        if next_href != '#next':
            yield scrapy.Request('http://m.163.com' + next_href, callback=self.parse)
Beispiel #10
0
 def parse_page(self, response):
     """Request the detail page of every app linked from a topic page,
     carrying the page's category along in the request meta."""
     links = response.xpath(
         '//div[@class="topic_before"]/a/@href').extract()
     cat = util.get_text(response, '//div[@class="l_box_title"]/h3/text()')
     for href in links:
         url = 'http://apk.91.com' + href
         yield scrapy.Request(url,
                              callback=self.parse_item,
                              meta={'cat': cat})
Beispiel #11
0
def extract_actions(s):
    """Collect (chamber, action, date) tuples from the actions table.

    The page markup is broken — only the header row is wrapped in a
    <tr> — so the content cells are consumed three at a time.
    """
    actions = []
    anchor = s("a", {'name': 'actions'})[0]
    table = None
    for x in anchor.nextGenerator():
        if hasattr(x, 'name') and getattr(x, 'name') == 'table':
            table = x
            break
    if table:
        cells = table("td", {"class": "content"})
        # Index in steps of 3 instead of repeatedly re-slicing the list
        # (which was quadratic); a trailing partial triple is skipped
        # rather than raising on unpack.
        for i in range(0, len(cells) - 2, 3):
            date, chamber, action = cells[i:i + 3]
            date = get_text(date).replace("&nbsp;", " ").strip()
            chamber = standardize_chamber(get_text(chamber).lower())
            action = get_text(action)
            actions.append((chamber, action, date))
    return actions
Beispiel #12
0
    def parse(self, response):
        """Queue a detail-page request per app on a category list page
        (category name carried in meta), then follow the pager."""
        # [2:] drops the leading 2 characters of the crumb text.
        cate = util.get_text(response, '//li[@class="current"]/a/text()')[2:]
        app_list = response.xpath('//div[@class="app_list border_three"]//div[@class="app_info"]//a/@href').extract()
        for i in app_list:
            yield scrapy.Request('http://www.anzhi.com' + i, callback=self.parse_item, meta={'cate': cate})

        # Renamed from `next` to avoid shadowing the builtin.
        next_page = response.xpath('//div[@class="pagebars"]//a[@class="next"]/@href').extract()
        if next_page:
            yield scrapy.Request('http://www.anzhi.com' + next_page[0], callback=self.parse)
Beispiel #13
0
def extract_actions(s):
    """Collect (chamber, action, date) tuples from the actions table.

    The page markup is broken — only the header row is wrapped in a
    <tr> — so the content cells are consumed three at a time.
    """
    actions = []
    anchor = s("a", {'name': 'actions'})[0]
    table = None
    for x in anchor.nextGenerator():
        if hasattr(x, 'name') and getattr(x, 'name') == 'table':
            table = x
            break
    if table:
        cells = table("td", {"class": "content"})
        # Index in steps of 3 instead of repeatedly re-slicing the list
        # (which was quadratic); a trailing partial triple is skipped
        # rather than raising on unpack.
        for i in range(0, len(cells) - 2, 3):
            date, chamber, action = cells[i:i + 3]
            date = get_text(date).replace("&nbsp;", " ").strip()
            chamber = standardize_chamber(get_text(chamber).lower())
            action = get_text(action)
            actions.append((chamber, action, date))
    return actions
Beispiel #14
0
    def parse_item(self, response):
        """Scrape app metadata from a yingyonghui detail page and append
        one '\\001'-delimited record to self.fileout."""
        source = 'yingyonghui'

        name = util.get_text(response, '//h1[@class="app-name"]/text()')
        if not name:
            return

        # [3:] drops the leading 3-character label from the text node.
        version = util.get_text(response,
                                '//div[@class="intro"]/p[1]/text()[2]')[3:]

        first = util.get_text(
            response, '//div[@class="breadcrumb centre-content"]/a[2]/text()')
        second = util.get_text(
            response, '//div[@class="breadcrumb centre-content"]/a[3]/text()')
        category = first + '-' + second

        time = util.get_text(response, '//div[@class="intro"]/p[1]/text()')[3:]

        # Size sits between the "大小:" (size) and " 更新" (updated) labels.
        size = util.get_text(response,
                             '//span[@class="app-statistic"]/text()[2]')
        try:
            size = size.split('大小:')[1].split(' 更新')[0]
        except Exception:
            size = ''

        system = util.get_text(response,
                               '//p[@class="art-content"][3]/text()[4]')[3:]

        text = util.get_text(response, '//div[@class="main-info"]/p[1]', 0)

        # Download count precedes the "下载" (download) label.
        download = util.get_text(response,
                                 '//span[@class="app-statistic"]/text()')
        try:
            download = download.split('下载')[0]
        except Exception:
            download = ''

        pingfen = ''  # no rating available on this source

        tags = ''

        self.fileout.write(source + '\001' + name + '\001' + version + '\001' +
                           category + '\001' + util.unify_data(time) + '\001' +
                           size + '\001' + system + '\001' + text + '\001' +
                           util.unify_download_count(download) + '\001' +
                           pingfen + '\001' + tags)
        self.fileout.write('\n')
Beispiel #15
0
    def parse_item(self, response):
        """Scrape app metadata from a 91.com (apk91) detail page and
        append one '\\001'-delimited record to self.fileout."""
        source = 'apk91'

        name = util.get_text(response, '//h1[@class="ff f20 fb fl"]/text()')
        if not name:
            return

        # The slices below drop fixed-width label prefixes from the
        # info-list text nodes.
        version = util.get_text(response,
                                '//ul[@class="s_info"]/li[1]/text()')[3:]

        first = util.get_text(response,
                              '//div[@class="crumb clearfix"]/a[2]/text()')
        second = response.meta['cat']
        category = first + '-' + second

        # [5:15] keeps a 10-character date substring after the label.
        time = util.get_text(response,
                             '//ul[@class="s_info"]/li[5]/text()')[5:15]

        size = util.get_text(response,
                             '//ul[@class="s_info"]/li[3]/text()')[5:]

        system = util.get_text(response,
                               '//ul[@class="s_info"]/li[4]/text()')[5:]

        text = util.get_text(response, '//div[@class="o-content"]', 0)

        download = util.get_text(response,
                                 '//ul[@class="s_info"]/li[2]/text()')

        # Rating is embedded in the star link's CSS class (e.g. "...w4.5");
        # scaled by 20 — presumably to a 0-100 score.
        pingfen = util.get_text(
            response,
            '//div[@class="s_intro_pic fl"]/span[@class="spr star"]/a/@class')
        try:
            pingfen = str(float(pingfen.split('w')[1].split(' ')[0]) * 20)
        except Exception:
            pingfen = ''

        tag = response.xpath('//ul[@class="s_info"]/li[10]/a/text()').extract()
        tags = ','.join(tag)

        self.fileout.write(source + '\001' + name + '\001' + version + '\001' +
                           category + '\001' + util.unify_data(time) + '\001' +
                           size + '\001' + system + '\001' + text + '\001' +
                           util.unify_download_count(download) + '\001' +
                           pingfen + '\001' + tags)
        self.fileout.write('\n')
Beispiel #16
0
    def parse(self, response):
        """Yield a detail-page request per app link, then follow the
        "next page" link until it becomes the '#next' placeholder."""
        applist = response.xpath(
            '//p[@class="f-s3 t-overflow"]/a/@href').extract()
        for app in applist:
            yield scrapy.Request('http://m.163.com' + app,
                                 callback=self.parse_item)

        # 下一页 = "next page"; renamed to avoid shadowing builtin next().
        next_href = util.get_text(response, '//a[text()="下一页"]/@href')
        if next_href != '#next':
            yield scrapy.Request('http://m.163.com' + next_href,
                                 callback=self.parse)
def main():
    parser = argparse.ArgumentParser(description="Remove duplicate exception aliases.")

    parser.add_argument('file', metavar='FILE', type=str,
                        help="File to remove duplicates from.")
    parser.add_argument('--output', dest='output', action='store', default='FILE',
                        help="File to write result to. (default: %(default)s)")

    parser.add_argument('--write', dest='write', action='store_true',
                        help="Disable prompt, write to OUTPUT file automatically")

    args = parser.parse_args()

    if args.output == 'FILE':
        args.output = args.file

    print "file", '"' + args.file + '"'
    print "output", '"' + args.output + '"'
    print "write", args.write
    print

    if not os.path.exists(args.file):
        print "ERROR: file", '"' + args.file + '"', "does not exist"
        return

    data = remove_duplicates(args.file)

    output_original = get_text(args.output)

    data_len = len(data.replace('\n', '').replace('\r', ''))
    output_original_len = len(output_original.replace('\n', '').replace('\r', ''))

    print abs(data_len - output_original_len), "bytes changed"

    if data_len - output_original_len == 0:
        print "nothing to save"
        return

    print 'validating data:',
    if not validate(data, trace=True):
        return

    if not args.write:
        args.write = raw_input('Write to "' + args.output + '"? [no]: ') == 'yes'

    if args.write:
        f = open(args.output, 'wb')
        f.write(data)
        f.close()
        print "done"
    else:
        print 'not writing data'
Beispiel #18
0
    def parse(self, response):
        """Yield a detail-page request per app on the list page, carrying
        time/system/download along in meta, then follow the pager."""
        applist = response.xpath("//dl[@class='down_list pd20']")
        for app in applist:
            app_url = util.get_text(app,
                                    "./dd[@class='down_title']/h2/a/@href")
            time = util.get_text(
                app, "./dd[@class='down_attribute align_l']/span[3]/text()")
            system = util.get_text(
                app, "./dd[@class='down_attribute align_l']/span[5]/text()")
            download = util.get_text(
                app, "./dd[@class='down_attribute align_l']/span[7]/text()")
            yield scrapy.Request(app_url,
                                 callback=self.parse_item,
                                 meta={
                                     "time": time,
                                     "system": system,
                                     "download": download
                                 })

        # 下一页 = "next page"; renamed to avoid shadowing builtin next().
        next_page = response.xpath('//a[text()="下一页"]/@href').extract()
        if next_page:
            yield scrapy.Request(next_page[0], callback=self.parse)
Beispiel #19
0
    def parse_item(self, response):
        """Scrape app metadata from a 3310.com detail page and append
        one '\\001'-delimited record to self.fileout."""
        source = '3310'

        # Heading is "Name ... version"; the last space-separated token
        # is the version, the rest is the name.
        name_version = util.get_text(response,
                                     '//div[@class="cont"]/h2/text()')
        if not name_version:
            return

        ns = name_version.split(' ')
        version = ns.pop(-1)
        name = ' '.join(ns)

        first = util.get_text(response, '//div[@class="guide"]/a[3]/text()')
        second = util.get_text(response, '//div[@class="guide"]/a[4]/text()')
        category = first + '-' + second

        # The slices below drop fixed-width label prefixes.
        time = util.get_text(response, '//div[@class="cont"]/p[2]/text()')[5:]

        size = util.get_text(response,
                             '//div[@class="cont"]/p[1]/span/text()')[3:]

        system = util.get_text(response,
                               '//div[@class="cont"]/p[3]/span/text()')[5:]

        text = util.get_text(response,
                             '//div[@class="pictxt item"][not(@style)]', 0)

        download = util.get_text(response, '//span[@id="downnum"]/text()')

        # Rating scaled by 20 — presumably from 0-5 stars to 0-100.
        pingfen = util.get_text(response, '//div[@class="score"]/span/text()')
        try:
            pingfen = str(float(pingfen) * 20)
        except Exception:
            pingfen = ''

        tags = ''

        self.fileout.write(source + '\001' + name + '\001' + version + '\001' +
                           category + '\001' + util.unify_data(time) + '\001' +
                           size + '\001' + system + '\001' + text + '\001' +
                           util.unify_download_count(download) + '\001' +
                           pingfen + '\001' + tags)
        self.fileout.write('\n')
Beispiel #20
0
    def parse_item(self, response):
        """Scrape app metadata from an anzow detail page (time/system/
        download arrive via request meta) and append one '\\001'-delimited
        record to self.fileout."""
        source = "anzow"

        name = util.get_text(response, "//dl[@class='down_info clear']/dd/div[1]/h1/text()")
        if not name:
            return

        version = ""  # not exposed on this page

        # [-2:] keeps the last 2 characters of the crumb text.
        first = util.get_text(response, "//div[@class='crumbs fl']/a[2]/text()")[-2:]
        second = util.get_text(response, "//div[@class='crumbs fl']/a[3]/text()")
        category = first + "-" + second

        time = response.meta["time"]

        size = util.get_text(response, '//div[@class="xiazai1"][1]/../dl/dt/ul/li[3]/text()')

        system = response.meta["system"]

        text = util.get_text(response, '//div[@class="down_intro"]', 0)

        download = response.meta["download"]

        # Rating shown as repeated "★" glyphs; 20 points per star.
        pingfen = util.get_text(response, '//dl[@class="down_info clear"]/dd/dl/dt/ul/li[7]/strong/text()')
        try:
            pingfen = str(pingfen.count("★") * 20)
        except Exception:
            pingfen = ""

        tag = response.xpath('//p[@class="keywords"]//a/text()').extract()
        tags = ",".join(tag)

        self.fileout.write(
            source
            + "\001"
            + name
            + "\001"
            + version
            + "\001"
            + category
            + "\001"
            + util.unify_data(time)
            + "\001"
            + size
            + "\001"
            + system
            + "\001"
            + text
            + "\001"
            + util.unify_download_count(download)
            + "\001"
            + pingfen
            + "\001"
            + tags
        )
        self.fileout.write("\n")
Beispiel #21
0
    def parse_item(self, response):
        """Scrape app metadata from a yingyonghui detail page and append
        one '\\001'-delimited record to self.fileout."""
        source = 'yingyonghui'

        name = util.get_text(response, '//h1[@class="app-name"]/text()')
        if not name:
            return

        # [3:] drops the leading 3-character label from the text node.
        version = util.get_text(response, '//div[@class="intro"]/p[1]/text()[2]')[3:]

        first = util.get_text(response, '//div[@class="breadcrumb centre-content"]/a[2]/text()')
        second = util.get_text(response, '//div[@class="breadcrumb centre-content"]/a[3]/text()')
        category = first + '-' + second

        time = util.get_text(response, '//div[@class="intro"]/p[1]/text()')[3:]

        # Size sits between the "大小:" (size) and " 更新" (updated) labels.
        size = util.get_text(response, '//span[@class="app-statistic"]/text()[2]')
        try:
            size = size.split('大小:')[1].split(' 更新')[0]
        except Exception:
            size = ''

        system = util.get_text(response, '//p[@class="art-content"][3]/text()[4]')[3:]

        text = util.get_text(response, '//div[@class="main-info"]/p[1]',0)

        # Download count precedes the "下载" (download) label.
        download = util.get_text(response, '//span[@class="app-statistic"]/text()')
        try:
            download = download.split('下载')[0]
        except Exception:
            download = ''

        pingfen = ''  # no rating available on this source

        tags=''

        self.fileout.write(
            source + '\001' + name + '\001' + version + '\001' + category + '\001' + util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text + '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags
        )
        self.fileout.write('\n')
Beispiel #22
0
    def parse_item(self, response):
        """Scrape app metadata from a 163.com detail page and append one
        '\\001'-delimited record to self.fileout."""
        source = '163'

        name = util.get_text(response, '//span[@class="f-h1"]/text()')
        if not name:
            return

        version = util.get_text(
            response, '//table[@class="table-appinfo"]/tr[3]/td/text()')

        # [-2:] keeps the last 2 characters of the crumb text.
        first = util.get_text(
            response,
            "//div[@class='sect']/div[@class='crumb']/a[2]/text()")[-2:]
        second = util.get_text(
            response, "//div[@class='sect']/div[@class='crumb']/a[3]/text()")
        category = first + '-' + second

        time = ''  # not exposed on this page

        size = util.get_text(
            response, '//table[@class="table-appinfo"]/tr[2]/td[1]/text()')

        system = ''  # not exposed on this page

        text = util.get_text(response, '//div[@id="app-desc"]', 0)

        # [1:-1] strips the surrounding brackets/parentheses.
        download = util.get_text(response,
                                 '//span[@class="vote-text-s"]/text()')[1:-1]

        # Rating is the percentage in an inline style, e.g. "width:86%".
        pingfen = util.get_text(response,
                                '//span[@class="vote-column-s"]/i/@style')
        try:
            pingfen = pingfen.split(':')[1].split('%')[0]
        except Exception:
            pingfen = ''

        tags = ''

        self.fileout.write(source + '\001' + name + '\001' + version + '\001' +
                           category + '\001' + util.unify_data(time) + '\001' +
                           size + '\001' + system + '\001' + text + '\001' +
                           util.unify_download_count(download) + '\001' +
                           pingfen + '\001' + tags)
        self.fileout.write('\n')
Beispiel #23
0
    def parse_item(self, response):
        """Scrape app metadata from a hiapk detail page and append one
        '\\001'-delimited record to self.fileout."""
        source = 'hiapk'

        # Page title has the form "Name(version)"; fall back to the raw
        # title (empty version) when the parentheses are absent.
        name_and_version = util.get_text(response, "//div[@id='appSoftName']/text()")
        try:
            version = name_and_version.split('(')[1].split(')')[0]
            name = name_and_version.split('(')[0]
        except Exception:
            version = ''
            name = name_and_version
        if not name:
            return

        first = util.get_text(response, "//a[@id='categoryParent']/text()")
        second = util.get_text(response, "//a[@id='categoryLink']/text()")
        category = first + '-' + second

        time = util.get_text(response, '//div[@class="code_box_border"]/div[@class="line_content"][7]/span[2]/text()')

        size = util.get_text(response, '//span[@id="appSize"]/text()')

        system = util.get_text(response, '//span[@class="font14 detailMiniSdk d_gj_line left"]/text()')

        text = util.get_text(response, '//pre[@id="softIntroduce"]',0)

        download = util.get_text(response, '//div[@class="code_box_border"]/div[@class="line_content"][2]/span[2]/text()')

        # Rating is embedded in a CSS class name; the third underscore
        # part is numeric (doubled — presumably to a 10-point scale).
        pingfen = util.get_text(response, '//div[@id="appIconTips"]/div[1]/@class')
        try:
            pingfen = str(float(pingfen.split(" ")[2].split("_")[2])*2)
        except Exception:
            pingfen =''

        tags=''

        self.fileout.write(
            source + '\001' + name + '\001' + version + '\001' + category + '\001' + util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text + '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags
        )
        self.fileout.write('\n')
Beispiel #24
0
    def parse_item(self, response):
        """Scrape app metadata from a 3310.com detail page and append
        one '\\001'-delimited record to self.fileout."""
        source = '3310'

        # Heading is "Name ... version"; the last space-separated token
        # is the version, the rest is the name.
        name_version = util.get_text(response, '//div[@class="cont"]/h2/text()')
        if not name_version:
            return

        ns = name_version.split(' ')
        version = ns.pop(-1)
        name = ' '.join(ns)

        first = util.get_text(response, '//div[@class="guide"]/a[3]/text()')
        second = util.get_text(response, '//div[@class="guide"]/a[4]/text()')
        category = first + '-' + second

        # The slices below drop fixed-width label prefixes.
        time = util.get_text(response, '//div[@class="cont"]/p[2]/text()')[5:]

        size = util.get_text(response, '//div[@class="cont"]/p[1]/span/text()')[3:]

        system = util.get_text(response, '//div[@class="cont"]/p[3]/span/text()')[5:]

        text = util.get_text(response, '//div[@class="pictxt item"][not(@style)]',0)

        download = util.get_text(response, '//span[@id="downnum"]/text()')

        # Rating scaled by 20 — presumably from 0-5 stars to 0-100.
        pingfen = util.get_text(response, '//div[@class="score"]/span/text()')
        try:
            pingfen = str(float(pingfen)*20)
        except Exception:
            pingfen =''

        tags=''

        self.fileout.write(
            source + '\001' + name + '\001' + version + '\001' + category + '\001' + util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text + '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags
        )
        self.fileout.write('\n')
Beispiel #25
0
    def parse_item(self, response):
        """Scrape app metadata from a 91.com (apk91) detail page and
        append one '\\001'-delimited record to self.fileout."""
        source = 'apk91'

        name = util.get_text(response, '//h1[@class="ff f20 fb fl"]/text()')
        if not name:
            return

        # The slices below drop fixed-width label prefixes from the
        # info-list text nodes; [5:15] keeps a 10-character date.
        version = util.get_text(response, '//ul[@class="s_info"]/li[1]/text()')[3:]

        first = util.get_text(response, '//div[@class="crumb clearfix"]/a[2]/text()')
        second = response.meta['cat']
        category = first + '-' + second

        time = util.get_text(response, '//ul[@class="s_info"]/li[5]/text()')[5:15]

        size = util.get_text(response, '//ul[@class="s_info"]/li[3]/text()')[5:]

        system = util.get_text(response, '//ul[@class="s_info"]/li[4]/text()')[5:]

        text = util.get_text(response, '//div[@class="o-content"]',0)

        download = util.get_text(response, '//ul[@class="s_info"]/li[2]/text()')

        # Rating is embedded in the star link's CSS class (e.g. "...w4.5");
        # scaled by 20 — presumably to a 0-100 score.
        pingfen = util.get_text(response, '//div[@class="s_intro_pic fl"]/span[@class="spr star"]/a/@class')
        try:
            pingfen = str(float(pingfen.split('w')[1].split(' ')[0])*20)
        except Exception:
            pingfen =''

        tag = response.xpath('//ul[@class="s_info"]/li[10]/a/text()').extract()
        tags=','.join(tag)

        self.fileout.write(
            source + '\001' + name + '\001' + version + '\001' + category + '\001' + util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text + '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags
        )
        self.fileout.write('\n')
Beispiel #26
0
    def parse_item(self, response):
        """Scrape app metadata from a 25pp detail page and append one
        '\\001'-delimited record to self.fileout."""
        source = '25pp'

        name = util.get_text(response, '//div[@class="title-stat"]/div[@class="txt"]/h1/text()')
        if not name:
            return

        # The slices below drop fixed-width label prefixes.
        version = util.get_text(response, '//div[@class="title-stat"]/div[@class="txt"]/ul/li[1]/text()')[3:]

        first = util.get_text(response, '//div[@class="location"]/a[2]/text()')
        second = util.get_text(response, '//div[@class="title-stat"]/div[@class="txt"]/ul/li[2]/text()')
        category = first + '-' + second

        time = ''  # not exposed on this page

        size = util.get_text(response, '//div[@class="title-stat"]/div[@class="txt"]/ul/li[3]/text()')[3:]

        system = util.get_text(response, '//div[@class="title-stat"]/div[@class="txt"]/ul/li[5]/text()')[5:]

        text = util.get_text(response, '//div[@class="conTxt"][1]',0)

        download = util.get_text(response, '//li[@class="borderR"]/span/text()')

        # Rating scaled by 20 — presumably from 0-5 stars to 0-100.
        pingfen = util.get_text(response, '//div[@class="downMunber"]/ul/li[3]/span/text()')
        try:
            pingfen = str(float(pingfen)*20)
        except Exception:
            pingfen =''

        tag = response.xpath('//li[@class="w-450"]//a/text()').extract()
        tags=','.join(tag)

        self.fileout.write(
            source + '\001' + name + '\001' + version + '\001' + category + '\001' + util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text + '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags
        )
        self.fileout.write('\n')
Beispiel #27
0
    def parse_item(self, response):
        """Scrape app metadata from a 360 app-store detail page (category
        carried in request meta) and append one '\\001'-delimited record
        to self.fileout."""
        source = '360'

        name = util.get_text(response, '//h2[@id="app-name"]/span/text()')
        if not name:
            return

        version = util.get_text(response, '//div[@class="breif"]/div[@class="base-info"]/table/tbody/tr[2]/td[1]/text()')

        # [1:] drops the leading character of the nav text.
        first = util.get_text(response, '//div[@class="nav"]/ul/li[@class="cur"]/a/text()')[1:]
        second = response.meta['categroy']
        category = first + '-' + second

        time = util.get_text(response, '//div[@class="breif"]/div[@class="base-info"]/table/tbody/tr[1]/td[2]/text()')

        size = util.get_text(response, '//div[@class="pf"]/span[@class="s-3"][2]/text()')

        system = util.get_text(response, '//div[@class="breif"]/div[@class="base-info"]/table/tbody/tr[2]/td[2]/text()')

        text = util.get_text(response, '//div[@class="breif"]',0)

        download = util.get_text(response, '//div[@class="pf"]/span[@class="s-3"][1]/text()')

        # Rating scaled by 10 — presumably from 0-10 to 0-100.
        pingfen = util.get_text(response, '//div[@class="pf"]/span[@class="s-1 js-votepanel"]/text()')
        try:
            pingfen = str(float(pingfen)*10)
        except Exception:
            pingfen =''

        tag = response.xpath('//div[@class="app-tags"]//a/text()').extract()
        tags=','.join([i.strip() for i in tag])

        self.fileout.write(
            source + '\001' + name + '\001' + version + '\001' + category + '\001' + util.unify_data(time) + '\001' + size + '\001' + system + '\001' + text + '\001' + util.unify_download_count(download) + '\001' + pingfen + '\001' + tags
        )
        self.fileout.write('\n')
Beispiel #28
0
    def parse(self, response):
        """Scrape app metadata from a Baidu app-store page and append one
        '\\001'-delimited record to self.fileout."""
        source = 'baidu'

        name = util.get_text(response, '//span[@class="gray"]/text()')
        if not name:
            return

        # The slices below drop fixed-width label prefixes.
        version = util.get_text(response,
                                '//span[@class="version"]/text()')[3:]

        first = util.get_text(response, '//div[@class="nav"]//a/text()')
        second = util.get_text(response,
                               '//div[@class="nav"]/span[3]/a/text()')
        category = first + '-' + second

        time = ''  # not exposed on this page

        size = util.get_text(response, '//span[@class="size"]/text()')[3:]

        system = ''  # not exposed on this page

        text = util.get_text(response, '//div[@class="brief-long"]/p', 0)

        download = util.get_text(response,
                                 '//span[@class="download-num"]/text()')[5:]

        # Rating is the percentage in an inline style, e.g. "width:86%".
        pingfen = util.get_text(response,
                                '//span[@class="star-percent"]/@style')
        try:
            pingfen = pingfen.split(':')[1].split('%')[0]
        except Exception:
            pingfen = ''

        tags = ''

        self.fileout.write(source + '\001' + name + '\001' + version + '\001' +
                           category + '\001' + util.unify_data(time) + '\001' +
                           size + '\001' + system + '\001' + text + '\001' +
                           util.unify_download_count(download) + '\001' +
                           pingfen + '\001' + tags)
        self.fileout.write('\n')
def merge(localPath, remotePath):
    items = {}
    key_order = []

    changes = {}

    for path in [localPath, remotePath]:
        for (li, tvdb_id, sep, alias_list, line) in parse_data(get_text(path)):
            if not validate_tvdb_id(tvdb_id):
                continue

            if not items.has_key(tvdb_id):
                items[tvdb_id] = []
                key_order.append(tvdb_id)

            for alias in alias_list:
                alias = alias.strip().replace("'", "\\'")

                if not find_match(alias, items[tvdb_id]):
                    items[tvdb_id].append(alias)

                    # track remote changes
                    if path == remotePath:
                        if not changes.has_key(tvdb_id):
                            changes[tvdb_id] = []
                        changes[tvdb_id].append(alias)

    print "----------------------------------------------------------"
    print "New Shows"
    print "----------------------------------------------------------"
    for ck, added in changes.items():
        if items[ck] == added:
            print str(ck) + '\tnew\t\t' + str(added)

    print "----------------------------------------------------------"
    print "New Aliases"
    print "----------------------------------------------------------"
    for ck, added in changes.items():
        if items[ck] != added:
            print str(ck) + '\tadd\t\t' + str(added)
            print '=============\t', items[ck]
            print

    return dict_to_data(items, key_order)
def main():
    parser = argparse.ArgumentParser(description="Validate file is in the correct form.")

    parser.add_argument('file', metavar='FILE', type=str,
                        help="LOCAL file to validate.")

    args = parser.parse_args()

    if not os.path.exists(args.file):
        print "ERROR: file", '"' + args.file + '"', "does not exist"
        return

    print "file", '"' + args.file + '"'
    print

    valid = validate(get_text(args.file), trace=True)

    if not valid:
        sys.exit(1)
Beispiel #31
0
def train(input_paths, encoding, output_path):
    """Train a Naive Bayes text classifier and pickle it to *output_path*.

    :param input_paths: a list of one string representing the input path to a
        metadata file. Each line of the file contains class name and path to
        input file separated by whitespace.
    :param encoding: character encoding used when reading each document.
    :param output_path: file the pickled NaiveBayesModel is written to.
    """
    input_metadata = input_paths[0]

    doc_tokens = []   # token list per document
    doc_labels = []   # class label per document (parallel to doc_tokens)

    class_freqs = collections.defaultdict(int)   # documents per class
    word_freqs = collections.defaultdict(int)    # (class, word) counts
    class_sizes = collections.defaultdict(int)   # total tokens per class

    vocabulary = set()

    with open(input_metadata, "r") as metadata_file:
        for line in metadata_file:
            doc_label, doc_path = line.strip().split()

            with open(doc_path, "r") as doc_file:
                tokens = util.tokenize(util.get_text(doc_file, encoding))

            doc_labels.append(doc_label)
            doc_tokens.append(tokens)

            class_freqs[doc_label] += 1
            # BUGFIX: accumulate this document's token count; the original
            # added len(doc_tokens) — the number of documents read so far.
            class_sizes[doc_label] += len(tokens)

            for word in tokens:
                vocabulary.add(word)
                word_freqs[(doc_label, word)] += 1

    # Convert raw per-class document counts into prior probabilities.
    class_freqs = {
        c: f / float(len(doc_tokens))
        for c, f in class_freqs.iteritems()
    }

    model = NaiveBayesModel(class_freqs, word_freqs, class_sizes, vocabulary)

    # Binary mode: pickle requires it on Windows and for protocols >= 1.
    with open(output_path, "wb") as output_file:
        pickle.dump(model, output_file)
Beispiel #32
0
    def classify(self, input_path, encoding):
        """Classify the document stored at *input_path*.

        :param input_path: path of the document to classify.
        :param encoding: character encoding used to read the document.
        :return: a tuple of (class label, weights) where ``weights`` is the
            list of (class label, weight) tuples
        """
        with open(input_path, "r") as input_file:
            doc_tokens = util.tokenize(util.get_text(input_file, encoding))

        weights = self.doc_in_class_probabilities(doc_tokens,
                                                  self.class_freqs.keys())

        # Linear scan for the label with the highest weight.
        best_weight = None
        for candidate_label, candidate_weight in weights:
            if best_weight is None or candidate_weight > best_weight:
                best_weight = candidate_weight
                best_label = candidate_label

        return (best_label, weights)
def remove_duplicates(path):
    """Re-read the alias data file at *path* and drop duplicate aliases.

    Lines with an invalid tvdb id are skipped entirely.  Returns the
    de-duplicated data serialized via dict_to_data(), preserving the order
    in which tvdb ids first appeared.
    """
    items = {}       # tvdb_id -> unique aliases
    key_order = []   # tvdb_ids in first-seen order

    for (li, tvdb_id, sep, alias_list, line) in parse_data(get_text(path)):
        if not validate_tvdb_id(tvdb_id):
            continue

        # dict.has_key() is deprecated (removed in Python 3); use `in`.
        # (The original's unused `changes` accumulator was dropped.)
        if tvdb_id not in items:
            items[tvdb_id] = []
            key_order.append(tvdb_id)

        for alias in alias_list:
            alias = alias.strip().replace("'", "\\'")

            if not find_match(alias, items[tvdb_id]):
                items[tvdb_id].append(alias)

    return dict_to_data(items, key_order)
Beispiel #34
0
    def parse_item(self, response):
        """Scrape one mumayi app page and append a SOH ('\\001') separated
        record to ``self.fileout``."""
        source = 'mumayi'

        name_version = util.get_text(
            response, '//h1[@class="iappname hidden fl"]/text()')
        if not name_version:
            return

        # The heading reads "<name>V<version>"; the version follows the
        # final 'V' when one is present.
        pieces = name_version.split('V')
        if len(pieces) > 1:
            version = pieces.pop(-1)
        else:
            version = ''
        name = 'V'.join(pieces) if pieces else name_version

        # "<top category (2 chars)>-<sub category>" from the class list.
        first = util.get_text(response,
                              '//div[@id="classlists"]/a[2]/text()')[:2]
        second = util.get_text(response, '//div[@id="classlists"]/a[3]/text()')
        category = first + '-' + second

        time = response.meta['time']
        size = util.get_text(response, '//span[text()="程序大小:"]/../text()')
        system = util.get_text(response, '//div[@class="sel_text fl"]/text()')
        text = util.get_text(response,
                             '//ul[@class="author"]/..//p[position()<last()]',
                             0)
        download = ''
        tags = ''

        # The star rating follows "now" inside the element's class string.
        pingfen = util.get_text(response, '//div[@id="starlist"]/@class')
        try:
            pingfen = str(float(pingfen.split('now')[1]) * 2)
        except Exception:
            pingfen = ''

        record = '\001'.join([source, name, version, category,
                              util.unify_data(time), size, system, text,
                              util.unify_download_count(download), pingfen,
                              tags])
        self.fileout.write(record)
        self.fileout.write('\n')
Beispiel #35
0
    def parse_item(self, response):
        """Write one yingyongbao record; most fields are pre-extracted into
        ``response.meta`` by the listing callback."""
        source = 'yingyongbao'

        name = response.meta['name']
        if not name:
            return

        version = response.meta['version']

        # Only the sub-category is available; it doubles as the category.
        second = response.meta['category']
        category = response.meta['category']

        # meta['time'] is presumably epoch seconds; render as YYYY-MM-DD.
        t = response.meta['time']
        import time
        try:
            st = time.strftime('%Y-%m-%d', time.localtime(t))
        except Exception:
            st = ''

        # Byte count -> megabytes string.
        size = str(response.meta['size'] / 1000000) + 'M'

        system = ''
        tags = ''

        text = util.get_text(response, '//div[@class="det-intro-text"]', 0)
        download = str(response.meta['appdown'])
        pingfen = str(response.meta['pingfen'])

        record = '\001'.join([source, name, version, category, st, size,
                              system, text,
                              util.unify_download_count(download), pingfen,
                              tags])
        self.fileout.write(record)
        self.fileout.write('\n')
Beispiel #36
0
def parse_bill(scraper, url):
    """Given a bill status URL, return a fully loaded Bill object, except
    for votes, which are expected to be handled externally.
    """
    # Session and chamber are both derivable from the URL itself.
    session = extract_session(url)
    chamber = chamber_for_doctype(extract_doctype(url))
    soup = get_soup(scraper, url)
    bill_id = extract_bill_id(soup)

    # The bill's short description sits in the sibling of the node holding
    # the "Short Description" label.
    landmark = soup(text=re.compile(".*Short Description.*"))
    name_span = landmark[0].findParent().findNextSibling()
    bill_name = get_text(name_span)

    bill = Bill(session, chamber, bill_id, bill_name.strip(), status_url=url)

    actions = extract_actions(soup)
    for action_chamber, action, date in actions:
        # kwargs are permitted if we have 'em.
        bill.add_action(action_chamber, action, date)

    # Sponsors are recovered from the action descriptions.
    sponsor_dict = extract_sponsors_from_actions(
        [action[1] for action in actions])
    for sponsor_type, namelist in sponsor_dict.iteritems():
        for name in namelist:
            bill.add_sponsor(sponsor_type, name)

    for name, link in extract_versions(scraper, soup):
        bill.add_version(name, link)

    return bill
Beispiel #37
0
def parse_bill(scraper, url):
    """Given a bill status URL, return a fully loaded Bill object, except for votes, which
       are expected to be handled externally.
    """
    # Session and chamber are both derivable from the URL itself.
    session = extract_session(url)
    chamber = chamber_for_doctype(extract_doctype(url))
    s = get_soup(scraper, url)
    bill_id = extract_bill_id(s)
    # The bill's short description sits in the sibling of the node that
    # holds the "Short Description" label.
    landmark = s(text=re.compile(".*Short Description.*"))
    name_span = landmark[0].findParent().findNextSibling()
    bill_name = get_text(name_span)
    bill = Bill(session, chamber, bill_id, bill_name.strip(),status_url=url)
    actions = extract_actions(s)
    # NOTE(review): the loop variable shadows the outer `chamber` binding;
    # harmless here because `chamber` is not read again afterwards.
    for chamber,action,date in actions:
        bill.add_action(chamber,action,date) #kwargs are permitted if we have 'em.
    # Sponsors are recovered from the textual action descriptions.
    sponsor_dict = extract_sponsors_from_actions([action[1] for action in actions])
    for type,namelist in sponsor_dict.iteritems():
        for name in namelist:
            bill.add_sponsor(type,name)
    for name,link in extract_versions(scraper, s):
        bill.add_version(name,link)
    return bill
Beispiel #38
0
    def parse_item(self, response):
        """Scrape one Apple App Store page and append a SOH ('\\001')
        separated record to ``self.fileout``."""
        source = 'applestore'

        name = util.get_text(
            response,
            '//div[@id="desktopContentBlockId"]//div[@id="title"]//h1/text()')
        if not name:
            return

        version = util.get_text(
            response,
            '//div[@id="left-stack"]//span[@itemprop="softwareVersion"]/text()')

        # The top-level category is fixed; the page only exposes the
        # sub-category.
        first = '软件'
        second = util.get_text(
            response,
            '//div[@id="left-stack"]//span[@itemprop="applicationCategory"]/text()')
        category = first + '-' + second

        time = util.get_text(
            response,
            '//div[@id="left-stack"]//span[@itemprop="datePublished"]/text()')

        size = ''
        system = util.get_text(
            response,
            '//div[@id="left-stack"]//span[@itemprop="operatingSystem"]/text()')
        text = util.get_text(
            response,
            '//div[@class="center-stack"]/div[@class="product-review"]/p', 0)
        download = util.get_text(
            response,
            '//div[@class="extra-list customer-ratings"]/div[4]/span/text()')

        # The aria-label appears to start with "<stars>星, "; scale the
        # 0-5 star value to 0-100.
        pingfen = util.get_text(
            response,
            '//div[@class="extra-list customer-ratings"]/div[4]/@aria-label')
        try:
            pingfen = str(float(pingfen.split('星, ')[0]) * 20)
        except Exception:
            pingfen = ''

        tags = ''

        record = '\001'.join([source, name, version, category,
                              util.unify_data(time), size, system, text,
                              util.unify_download_count(download), pingfen,
                              tags])
        self.fileout.write(record)
        self.fileout.write('\n')
Beispiel #39
0
    def parse(self, response):
        """Extract one Baidu app record and write it to ``self.fileout`` as
        a single SOH ('\\001') separated line."""
        source = 'baidu'

        name = util.get_text(response, '//span[@class="gray"]/text()')
        if not name:
            # Not an app detail page.
            return

        version = util.get_text(response, '//span[@class="version"]/text()')[3:]

        first = util.get_text(response, '//div[@class="nav"]//a/text()')
        second = util.get_text(response, '//div[@class="nav"]/span[3]/a/text()')
        category = first + '-' + second

        time = ''
        size = util.get_text(response, '//span[@class="size"]/text()')[3:]
        system = ''
        text = util.get_text(response, '//div[@class="brief-long"]/p', 0)
        download = util.get_text(response, '//span[@class="download-num"]/text()')[5:]

        # Inline style such as "width:86%" carries the rating percentage.
        raw_style = util.get_text(response, '//span[@class="star-percent"]/@style')
        try:
            pingfen = raw_style.split(':')[1].split('%')[0]
        except Exception:
            pingfen = ''

        tags = ''

        self.fileout.write('\001'.join((
            source, name, version, category, util.unify_data(time), size,
            system, text, util.unify_download_count(download), pingfen, tags,
        )))
        self.fileout.write('\n')
Beispiel #40
0
    def parse_item(self, response):
        """Scrape one 163 (NetEase) app page and append a SOH ('\\001')
        separated record to ``self.fileout``."""
        source = '163'

        name = util.get_text(response, '//span[@class="f-h1"]/text()')
        if not name:
            return

        version = util.get_text(
            response, '//table[@class="table-appinfo"]/tr[3]/td/text()')

        # Last two chars of the breadcrumb's second link + the third link.
        first = util.get_text(
            response,
            "//div[@class='sect']/div[@class='crumb']/a[2]/text()")[-2:]
        second = util.get_text(
            response, "//div[@class='sect']/div[@class='crumb']/a[3]/text()")
        category = first + '-' + second

        time = ''
        system = ''
        tags = ''

        size = util.get_text(
            response, '//table[@class="table-appinfo"]/tr[2]/td[1]/text()')
        text = util.get_text(response, '//div[@id="app-desc"]', 0)
        # [1:-1] drops the first and last character (presumably brackets
        # around the vote count).
        download = util.get_text(
            response, '//span[@class="vote-text-s"]/text()')[1:-1]

        # Rating percentage from an inline style such as "width:86%".
        pingfen = util.get_text(
            response, '//span[@class="vote-column-s"]/i/@style')
        try:
            pingfen = pingfen.split(':')[1].split('%')[0]
        except Exception:
            pingfen = ''

        record = '\001'.join([source, name, version, category,
                              util.unify_data(time), size, system, text,
                              util.unify_download_count(download), pingfen,
                              tags])
        self.fileout.write(record)
        self.fileout.write('\n')
Beispiel #41
0
    def parse_item(self, response):
        """Append one yingyongbao record; fields are pre-extracted into
        ``response.meta`` by the crawling callback."""
        source = 'yingyongbao'

        name = response.meta['name']
        if not name:
            return

        version = response.meta['version']

        # Only the sub-category exists; it doubles as the category field.
        second = response.meta['category']
        category = response.meta['category']

        t = response.meta['time']
        import time
        # t is presumably epoch seconds; format as YYYY-MM-DD.
        try:
            st = time.strftime('%Y-%m-%d', time.localtime(t))
        except Exception:
            st = ''

        # Byte count -> megabytes string.
        size = str(response.meta['size'] / 1000000) + 'M'

        system = ''
        text = util.get_text(response, '//div[@class="det-intro-text"]', 0)
        download = str(response.meta['appdown'])
        pingfen = str(response.meta['pingfen'])
        tags = ''

        self.fileout.write('\001'.join([
            source, name, version, category, st, size, system, text,
            util.unify_download_count(download), pingfen, tags,
        ]))
        self.fileout.write('\n')
Beispiel #42
0
    def parse_item(self, response):
        """Scrape one anzow app page and append a SOH ('\\001') separated
        record to ``self.fileout``."""
        source = 'anzow'

        name = util.get_text(
            response, "//dl[@class='down_info clear']/dd/div[1]/h1/text()")
        if not name:
            return

        version = ''

        # Last two chars of the second breadcrumb link + the third link.
        first = util.get_text(response,
                              "//div[@class='crumbs fl']/a[2]/text()")[-2:]
        second = util.get_text(response,
                               "//div[@class='crumbs fl']/a[3]/text()")
        category = first + '-' + second

        time = response.meta['time']
        system = response.meta['system']
        download = response.meta['download']

        size = util.get_text(
            response, '//div[@class="xiazai1"][1]/../dl/dt/ul/li[3]/text()')
        text = util.get_text(response, '//div[@class="down_intro"]', 0)

        # Rating is rendered as filled stars; each '★' is worth 20 points.
        pingfen = util.get_text(
            response,
            '//dl[@class="down_info clear"]/dd/dl/dt/ul/li[7]/strong/text()')
        try:
            pingfen = str(pingfen.count('★') * 20)
        except Exception:
            pingfen = ''

        # Keyword links become the comma-separated tag list (not stripped).
        tags = ','.join(
            response.xpath('//p[@class="keywords"]//a/text()').extract())

        self.fileout.write('\001'.join([
            source, name, version, category, util.unify_data(time), size,
            system, text, util.unify_download_count(download), pingfen, tags,
        ]))
        self.fileout.write('\n')
Beispiel #43
0
    def parse_item(self, response):
        """Write one SOH ('\\001') separated line for a mumayi app page."""
        source = 'mumayi'

        name_version = util.get_text(
            response, '//h1[@class="iappname hidden fl"]/text()')
        if not name_version:
            return

        # Split "<name>V<version>" on the final 'V'.
        chunks = name_version.split('V')
        version = chunks.pop(-1) if len(chunks) > 1 else ''
        name = 'V'.join(chunks) if chunks else name_version

        first = util.get_text(
            response, '//div[@id="classlists"]/a[2]/text()')[:2]
        second = util.get_text(
            response, '//div[@id="classlists"]/a[3]/text()')
        category = first + '-' + second

        time = response.meta['time']
        size = util.get_text(response, '//span[text()="程序大小:"]/../text()')
        system = util.get_text(response, '//div[@class="sel_text fl"]/text()')
        text = util.get_text(
            response, '//ul[@class="author"]/..//p[position()<last()]', 0)
        download = ''
        tags = ''

        # The star rating follows "now" in the element's class attribute.
        star_class = util.get_text(response, '//div[@id="starlist"]/@class')
        try:
            pingfen = str(float(star_class.split('now')[1]) * 2)
        except Exception:
            pingfen = ''

        self.fileout.write('\001'.join([
            source, name, version, category, util.unify_data(time), size,
            system, text, util.unify_download_count(download), pingfen, tags,
        ]))
        self.fileout.write('\n')
Beispiel #44
0
    def parse_item(self, response):
        """Scrape one Xiaomi app-store page and append a SOH ('\\001')
        separated record to ``self.fileout``."""
        source = 'xiaomi'

        name = util.get_text(response, '//div[@class="intro-titles"]/h3/text()')
        if not name:
            return

        version = util.get_text(response, '//ul[@class=" cf"]/li[4]/text()')

        # Top-level category travels in the request meta; sub-level comes
        # from the breadcrumb.
        first = response.meta['first']
        second = util.get_text(
            response, '//div[@class="bread-crumb"]/ul/li[2]/a/text()')
        category = first + '-' + second

        time = util.get_text(response, '//ul[@class=" cf"]/li[6]/text()')
        size = util.get_text(response, '//ul[@class=" cf"]/li[2]/text()')
        system = ''
        text = util.get_text(response, '//p[@class="pslide"]', 0)
        download = ''
        tags = ''

        # Rating hides in the class suffix "star1-hover star1-<n>".
        star = util.get_text(response, '//div[@class="star1-empty"]/div/@class')
        try:
            pingfen = str(float(star.split('star1-hover star1-')[1]) * 10)
        except Exception:
            pingfen = ''

        self.fileout.write('\001'.join([
            source, name, version, category, util.unify_data(time), size,
            system, text, util.unify_download_count(download), pingfen, tags,
        ]))
        self.fileout.write('\n')
Beispiel #45
0
 def parse(self, response):
     """Yield one listing-API request per wandoujia category tag."""
     # Chars [2:4] of the trailing breadcrumb text — presumably the
     # top-level category name; verify against the live page.
     first = util.get_text(response, '//span[@class="last"]/text()')[2:4]
     tag_names = response.xpath(
         '//ul[@class="clearfix tag-box"]//li/a/span/text()').extract()
     for tag_name in tag_names:
         url = ('http://apps.wandoujia.com/api/v1/apps?tag=' + tag_name +
                '&max=60&start=0&opt_fields=apps.packageName')
         yield scrapy.Request(url, callback=self.parse_page,
                              meta={'cat': tag_name, 'first': first})
Beispiel #46
0
# Script-level setup for character-RNN training.
# BUGFIX: `np`, `torch.optim`, and `nn` were used below without being
# imported; the missing imports are added here.
import sys

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from util import get_text, create_dicts, on_gpu, get_batches, one_hot_encode, write_file
from model import RNN

device = on_gpu()

# Declaring the hyperparameters
batch_size = 128
seq_length = 100
n_epochs = 100  # start smaller if you are just testing initial behavior
lr = 1e-3
# Corpus files come from the command line; default to the bundled corpus.
files = sys.argv[1:] if len(sys.argv) > 1 else ['shakespeare']
filename = '-'.join(files)

text = get_text(files)

chars, int2char, char2int = create_dicts(text)

# Encode the text
data = np.array([char2int[ch] for ch in text])

net = RNN(chars).to(device)
opt = torch.optim.Adam(net.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()


# Declaring the train method
def train(epochs=20, clip=5, val_frac=0.1, print_every=100):
    global data
    net.train()
Beispiel #47
0
"""
https://nlp100.github.io/ja/ch01.html#00-%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%80%86%E9%A0%86
"""
from util import get_text

if __name__ == '__main__':
    print(get_text())
Beispiel #48
0
"""
https://nlp100.github.io/ja/ch03.html#23-%E3%82%BB%E3%82%AF%E3%82%B7%E3%83%A7%E3%83%B3%E6%A7%8B%E9%80%A0
"""
import re

from util import get_text

if __name__ == '__main__':
    sep = "="
    pat = re.compile(r'(==+)(.*)==+')
    text = get_text()
    for match in re.finditer(pat, text):
        print(match.group(0))
        section_sep = match.group(1)
        print(len(section_sep))
Beispiel #49
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk

import util
# Language used for tokenization and tagging throughout this script.
# Doesn't work very well in french ...
LANGUAGE = "english"

# util.get_text() returns an open file-like object (it is .read() below).
text_file = util.get_text(LANGUAGE)

text_str = text_file.read()
# Tokenize the raw text into word tokens.
text_str = nltk.word_tokenize(text_str, language=LANGUAGE)

text_str = util.clean_text(text_str, LANGUAGE)

# Part-of-speech tag the cleaned tokens.
text_tag = nltk.pos_tag(text_str)

nltk_text = nltk.Text(text_str)

print type(nltk_text)

# CHUNKING

# Small worked example for the chunking experiments sketched below.
sentence = nltk.word_tokenize("Bouteflika is the president of Algeria.")
sentence = nltk.pos_tag(sentence)

# grammar = "Actor: {<DT>?<JJS>*<NNP>+}" # jj  adjectif
# chunk= nltk.RegexpParser(grammar)

# result = chunk.parse(text_tag)
Beispiel #50
0
 def testSimple(self):
     """get_text() should report (type, payload) for each entry element."""
     etree = fromstring(ENTRY1)
     cases = [
         ('title', ("text", "third")),
         ('summary', ("html", "<p>not much</p>")),
         ('content', ("xhtml", u'<p style="color:red" other=\'&amp; and &lt; and "\'>Some stuff</p><i>&lt;</i>.\n  ')),
     ]
     for tag, expected in cases:
         self.assertEqual(expected, util.get_text(tag, etree))
Beispiel #51
0
 def parse_page(self, response):
     """Follow every app link found on a 91.com topic listing page."""
     hrefs = response.xpath('//div[@class="topic_before"]/a/@href').extract()
     cat = util.get_text(response, '//div[@class="l_box_title"]/h3/text()')
     for href in hrefs:
         yield scrapy.Request('http://apk.91.com' + href,
                              callback=self.parse_item, meta={'cat': cat})
Beispiel #52
0
    def link(self, link, title, text):
        """Render a hyperlink in Pukiwiki syntax: ``[[text:link]]``.

        The *title* argument is part of the renderer interface but is not
        used by this output format.
        """
        return "[[{0}:{1}]]".format(text, link)


puki = PukiwikiRenderer()
# NOTE(review): the module-level name `puki` is rebound to a function
# further down; the renderer instance stays reachable through `md`.
md = mistune.Markdown(renderer=puki)


def puki(filename, comments):
    """Render a markdown entry file to Pukiwiki text plus a comment footer.

    NOTE(review): this def rebinds the module-level name ``puki`` (the
    renderer instance defined above); the renderer itself stays reachable
    through ``md``.
    """
    name = filename.split("/")[-1]
    # Missing entries default to "" so the join still yields a string.
    comment = "\n".join(comments.get(name, ""))
    with open(filename) as f:
        body = md(f.read())
    return body + f"\n//{name} cmt_begin\n{comment}\n#comment\n//{name} cmt_end\n"


print("getting text")
text = util.get_text()

print("merging")
merged = "\n".join([
    puki(filename, text.comments)
    for filename in sorted(glob("./entries/*"), reverse=True)
])

text.write(merged)
print("writing text")
response = text.set_text()
print(response)