Example #1
    def setIndex():
        from MyHTMLParser import MyHTMLParser
        import os

        HTMLlist = []
        nobody = [
            "index.html", "index1.html", "index2.html", "index3.html",
            "index4.html", "ru3.html", "ru2.html", "ru1.html", "rut.html",
            "rus.html", "ru.html"
        ]
        for (parent, d, f) in os.walk(NorroenDyrd.mirror):
            for fn in f:
                if fn in nobody:
                    continue
                elif fn.find(".html") == -1:
                    continue
                elif os.path.join(parent, fn) in HTMLlist:
                    continue
                else:
                    HTMLlist.append(os.path.join(parent, fn))
        for h in HTMLlist:
            entry = {}
            with open(h, "r", encoding="utf-8") as page:
                html = page.readlines()
            parser = MyHTMLParser()
            for line in html:
                parser.feed(line)
            entry["path"] = h.replace(NorroenDyrd.mirror, NorroenDyrd.base)
            entry["text"] = parser.plaintext
            entry["title"] = parser.title
            NorroenDyrd.index.append(entry)
            del parser
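The indexer above only needs the parser to expose plaintext and title after feeding; a minimal sketch of such a parser (the attribute names come from the example, the implementation is an assumption):

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Sketch: collect visible text in self.plaintext and the <title> contents in self.title."""

    def __init__(self):
        super().__init__()
        self.plaintext = ""
        self.title = ""
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data
        else:
            self.plaintext += data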
Example #2
    def __GetEmailContent__(self, filePath):
        self._myHtmlParserObj = MyHTMLParser()
        emailContent = ""
        with open(filePath, 'r') as handle:
            emailMessage = email.message_from_file(handle)

            emailBody = ""
            if emailMessage.is_multipart():
                for part in emailMessage.walk():

                    if part.get_content_type() in ("text/html", "text/plain"):
                        partPayload = part.get_payload()
                        emailBody = emailBody + ' ' + partPayload
            else:
                if emailMessage.get_content_type() in ("text/html", "text/plain"):
                    emailBody = emailMessage.get_payload()

            # Cleaning email content
            emailSubject = ''
            if 'subject' in emailMessage:
                emailSubject = self.__CleanEmailContent__(
                    emailMessage['subject'])

            emailContent = self._myHtmlParserObj.GetParsedContentFromHtml(
                emailBody)

            emailContent = str(emailSubject) + " " + str(emailContent)
            emailContent = self.__CleanEmailContent__(emailContent)

            return emailContent
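GetParsedContentFromHtml is not shown in this example; a minimal sketch of a tag-stripping implementation built on the standard HTMLParser (the method name comes from the call site, the body is an assumption):

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Sketch: strip markup and return the visible text of an HTML fragment."""

    def __init__(self):
        super().__init__()
        self._chunks = []

    def handle_data(self, data):
        self._chunks.append(data)

    def GetParsedContentFromHtml(self, html):
        self._chunks = []
        self.feed(html)
        return " ".join(self._chunks)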
Example #3
def create_journey_instructions(steps):
    parser = MyHTMLParser()  # HTML parser for directions API data
    instruct = ""
    for step in steps:
        parser.feed(step['html_instructions'])
        instruct += parser.get_data() + ">>>>>"
    print(instruct)
    return instruct
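create_journey_instructions feeds each step and then calls get_data(); a sketch of a parser consistent with that usage (resetting the buffer on each call is an assumption):

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Sketch: accumulate character data and hand it back through get_data()."""

    def __init__(self):
        super().__init__()
        self._data = []

    def handle_data(self, data):
        self._data.append(data)

    def get_data(self):
        text = "".join(self._data)
        self._data = []  # start clean before the next feed() (assumption)
        return text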
Example #4
    def __init__(self, config):
        self.config = config
        self.ht = myhashtable(config)
        self.htmlparser = MyHTMLParser(self.config, self.ht)
        self.start_batch_processing()
        self.write_file_map()
        # term_count is presumably defined elsewhere in the original module
        self.ht.write_posting_file(term_count)
        self.ht.write_hash_table()
Example #5
def table_maker(pd_row):
    #read data from html-like file
    h = MyHTMLParser()
    h.feed(pd_row['data'])
    soup = soupparser()
    p = soup.handle_data(pd_row)
    dividendpershare = [h.type_dividendpershare, h.asofdate_dividendpershare, h.reporttype_dividendpershare,
                        h.period_dividendpershare, h.currency_dividendpershare, p.data_dividendpershare,
                        h.exdate_dividendpershare, h.recorddate_dividendpershare, h.paydate_dividendpershare,
                        h.declarationdate_dividendpershare]
    totalrevenue = [h.type_totalrevenue, h.asofdate_totalrevenue, h.reporttype_totalrevenue,
                    h.period_totalrevenue, h.currency_totalrevenue, p.data_totalrevenue,
                    h.exdate_totalrevenue, h.recorddate_totalrevenue, h.paydate_totalrevenue,
                    h.declarationdate_totalrevenue]
    dividend = [h.type_dividend, h.asofdate_dividend, h.reporttype_dividend,
                h.period_dividend, h.currency_dividend, p.data_dividend,
                h.exdate_dividend, h.recorddate_dividend, h.paydate_dividend,
                h.declarationdate_dividend]
    eps = [h.type_eps, h.asofdate_eps, h.reporttype_eps,
           h.period_eps, h.currency_eps, p.data_eps,
           h.exdate_eps, h.recorddate_eps, h.paydate_eps, h.declarationdate_eps]

    #sort data and make it into a dataframe
    names = ['type', 'asofdate', 'reporttype', 'period', 'currency','data',
             'exdate', 'recorddate', 'paydate', 'declarationdate']
    def make_dataframe(list1):
        dict1 = {names[i]: list1[i] for i in range(10)}
        dataframe1 = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in dict1.items()]))
        dataframe1 = dataframe1.fillna(method='ffill')
        return dataframe1

    dividendpershare_dataframe = make_dataframe(dividendpershare)
    totalrevenue_dataframe = make_dataframe(totalrevenue)
    dividend_dataframe = make_dataframe(dividend)
    eps_dataframe = make_dataframe(eps)

    table1 = pd.concat([dividendpershare_dataframe, totalrevenue_dataframe,
                        dividend_dataframe, eps_dataframe], axis = 0, ignore_index=True)

    reqId1 = [pd_row['reqId']] * len(table1['type'])
    table1['reqId'] = pd.Series(np.array(reqId1), index = table1.index)

    #format each column to put into sql
    table1['type'] = table1['type'].astype(str)
    table1['reporttype'] = table1['reporttype'].astype(str)
    table1['period'] = table1['period'].astype(str)
    table1['asofdate'] = pd.to_datetime(table1['asofdate'])
    table1['exdate'] = pd.to_datetime(table1['exdate'])
    table1['recorddate'] = pd.to_datetime(table1['recorddate'])
    table1['paydate'] = pd.to_datetime(table1['paydate'])
    table1['declarationdate'] = pd.to_datetime(table1['declarationdate'])

    #drop_duplicate line
    table1 = table1.drop_duplicates()

    return table1
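make_dataframe relies on pandas padding Series of unequal length with NaN and then forward-filling; a tiny standalone illustration of that pattern (the values are made up):

import pandas as pd

# 'type' has one value, 'data' has three: the missing rows become NaN
cols = {"type": pd.Series(["DPS"]), "data": pd.Series([0.10, 0.12, 0.15])}
frame = pd.DataFrame(cols).fillna(method="ffill")
# after forward-filling, every row carries type "DPS"
print(frame)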
Example #6
    def dataFetcher(self, jobname=None):
        global masterList
        global masterDictionary
        global jobName
        global prePost
        # METHOD OVERLOADING (emulated with a default argument)
        if jobname is None:
            mydatafetcher = URLCreator(jobname=self.jobName)
        else:
            mydatafetcher = URLCreator(jobname=jobname)

        contents = mydatafetcher.loadUrl()
        parser = MyHTMLParser()
        root = parser.feed(contents)
        postdependencyList = list(set(parser.postList))
        predependencyList = list(set(parser.preList))
        # print('pre:',predependencyList)
        return predependencyList, postdependencyList
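The dependency fetcher only expects preList and postList on the parser; how the page marks a dependency as pre or post is not shown, so the sketch below invents a rel attribute purely for illustration:

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Sketch: collect dependency names into preList/postList (the real markup is unknown)."""

    def __init__(self):
        super().__init__()
        self.preList = []
        self.postList = []
        self._current = None

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == "a" and attrs.get("rel") in ("pre", "post"):
            self._current = attrs["rel"]

    def handle_data(self, data):
        name = data.strip()
        if name and self._current == "pre":
            self.preList.append(name)
        elif name and self._current == "post":
            self.postList.append(name)
        self._current = None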
Example #7
    def __init__(self, emailDirPath, spamMappingFilePath):
        self._emailDirPath = emailDirPath
        self._spamIdentifierMapFilePath = spamMappingFilePath

        # {'inmail.1' :'Spam', 'inmail.10':'Ham',...........'inmail.200':'Spam'}
        self._emailFileNameToSpamOrHamMap = {}
        self._emailFileNameToSpamOrHamMap = self.__LoadFileNameToSpamOrHamMapping__(
            self._spamIdentifierMapFilePath)

        self._myHtmlParserObj = MyHTMLParser()
        self.fp = "test.txt"

        if os.path.isfile(self.fp):
            os.remove(self.fp)
        with open(self.fp, 'w') as handle:
            handle.write(str(datetime.datetime.now()))
        self.bulkList = []

        self._ESMgrObject = ElasticSearchManager(Resource.INDEX_NAME,
                                                 Resource.TYPE_NAME)
Example #8
    def crawl(self, depth, frontier):
        if depth > self.maxdepth:
            return

        nextLevelFrontier = list()
        for url in frontier:
            # only parse while the number of crawled pages has not exceeded the maximum
            if len(self.crawledlist) < self.numPages and url not in self.crawledlist:
                # pass in the URL and create the request
                request = req.Request(
                    url,
                    headers={
                        "User-Agent":
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
                    })
                try:
                    time.sleep(1)
                    # send the request to the url and get the response
                    data = req.urlopen(request).read().decode("utf-8")
                    parser = MyHTMLParser()
                    parser.feed(data)
                    # record() may return None; only process the page when it was actually recorded
                    if self.record(url, depth):
                        self.create_web_file(data, len(self.crawledlist))
                        print(url)
                        print("Finished:", len(self.crawledlist), "files")
                        print("current depth: ", depth)
                        nextLevelFrontier += parser.urls
                # catch HTTP errors raised by the request
                except urllib.error.HTTPError as err:
                    # handling page not found error
                    if err.code == 404:
                        continue
                    else:
                        raise
        self.crawl(depth + 1, nextLevelFrontier)
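The crawler relies on the parser exposing a urls list of outgoing links; a minimal sketch of such a parser (the attribute name comes from the example, the rest is an assumption):

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Sketch: collect the href of every anchor tag into self.urls."""

    def __init__(self):
        super().__init__()
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.urls.append(value)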
Example #9
# MyHTMLParser
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        print("encountered a start tag: ", tag)

    def handle_endtag(self, tag):
        print("encountered an end tag : ", tag)

    def handle_data(self, data):
        print("encountered some data  : ", data)


parser = MyHTMLParser()

parser.feed('<html><head><title>test</title></head>'
            '<body><h1>parse me!</h1></body></html>')
Example #10
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    )
    req.add_header('Referer', 'http://gupiao.jd.com/find/12195')
    req.add_header('Host', 'gupiao.jd.com')
    req.add_header('Origin', 'http://gupiao.jd.com')
    req.add_header(
        'Cookie',
        'TrackID=1zjctpUkfXiPPpd2-FlJw52fq9gkx9v0WGqH_4sECdaGDpJ8D_58Bqx-Bx4HQsVMYTsT5X4AEec9ZtKVXPzJEMA; pinId=EX7C17pLL2_bXrUjzBWQTQ; __jdv=204210054|direct|-|none|-|1531620946230; _jrda=3; sec_flag=e125e94ccd30d095203da363b24adad3; sec_addr=c0a8006c; wlfstk_smdl=uj4fvqhhhqq66p2ddnrgf4vw8a2cggkb; 3AB9D23F7A4B3C9B=XG5I3N4FBWQZLN7HPAC56MKB755NV4K4D6CA6ICAOGCMBJBKMFJPJFYCRFOUFX7YP4IHFLD3YJJESRXWWTFXSHEVFM; __jda=204210054.1495960752486274042302.NaN.1525092662.1531620946.23; __jdb=204210054.10.1495960752486274042302|23.1531620946; __jdc=204210054; __jdu=1495960752486274042302; _jrdb=1531621024187'
    )
    req.add_header('Content-Type',
                   'application/x-www-form-urlencoded; charset=UTF-8')
    response = urllib2.urlopen(req)
    string = response.read().replace("\n", "").replace("\t", "").replace(
        " ", "").replace("%", "")
    # print string
    htmlParser = MyHTMLParser()
    htmlParser.feed(string)

    # Convert the object to JSON
    parserDict = htmlParser.__dict__

    try:
        parserDict.pop('interesting')
        parserDict.pop('lasttag')
        parserDict.pop('lineno')
        parserDict.pop('offset')
        parserDict.pop('cdata_elem')
        parserDict.pop('rawdata')
        parserDict.pop('_HTMLParser__starttag_text')
        parserDict.pop('index')
        # parserDict['buyNumStart'] = "100.00"
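Serialising the parser through __dict__ means knowing every internal HTMLParser attribute to pop; a sketch of the opposite approach that copies only the fields of interest (the helper is hypothetical, and the field name is borrowed from the commented-out line above):

import json

def parser_to_json(parser, fields=("buyNumStart",)):
    """Sketch: serialise only explicitly chosen parser attributes."""
    payload = {name: getattr(parser, name) for name in fields if hasattr(parser, name)}
    return json.dumps(payload, ensure_ascii=False)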
Example #11
    def parseItem(self, htmlText):
        parser = MyHTMLParser(htmlText, self.requestHandler)
        return parser.getJson()
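Example #11 treats the parser as a one-shot converter from HTML to JSON; a minimal sketch of that interface (the constructor signature comes from the call site, everything else is an assumption):

import json
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Sketch: parse htmlText on construction and expose the result through getJson()."""

    def __init__(self, htmlText, requestHandler=None):
        super().__init__()
        self.requestHandler = requestHandler
        self._items = []
        self.feed(htmlText)

    def handle_data(self, data):
        text = data.strip()
        if text:
            self._items.append(text)

    def getJson(self):
        return json.dumps(self._items)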