Code Example #1
def NewspaperParser(paras):
    html = paras['html']
    url = paras['url']
    item = {}
    item['parser'] = 'Newspaper'
    try:
        docrsp = doc(html, url)
        config = Config()
        config.fetch_images = False
        first_article = Article(url=url, config=config)
        first_article.download(docrsp.html())
        first_article.parse()
        item['title'] = first_article.title
        pubtime = first_article.publish_date
        if pubtime:
            # only store/validate pdate when a publish date was detected
            pubtimeint = int(time.mktime(pubtime.timetuple()))
            item['pdate'] = ValidateTime(pubtimeint)
        item['content'] = first_article.text
        item['showcontent'] = item['content'].replace("\n", "<br/>")
        return item

    except Exception, e:
        print e
        return item
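
A minimal usage sketch for NewspaperParser() above, assuming the page HTML has already been fetched by the caller; the URL and markup are placeholders, and ValidateTime() is the project's own helper, not shown in this example:

# Hypothetical input: 'html' is the raw page source fetched elsewhere
# (e.g. with requests or urllib2), 'url' is the page it came from.
paras = {
    'url': 'http://example.com/news/2016/05/article.html',
    'html': '<html><head><title>Sample</title></head><body><p>Body text</p></body></html>',
}
item = NewspaperParser(paras)
print(item.get('title'))   # parsed headline, if any
print(item.get('pdate'))   # validated publish timestamp, when a date was detected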
Code Example #2
File: parserlogic.py  Project: zymtech/parse_newspage
def NewspaperParser( paras ):
    html = paras['html']
    url = paras['url']
    item = {}
    item['parser'] = 'Newspaper'
    try:
        docrsp = doc(html,url)
        config = Config()
        config.fetch_images = False
        first_article = Article(url=url, config=config)
        first_article.download(docrsp.html())
        first_article.parse()
        item['title'] = first_article.title
        pubtime = first_article.publish_date
        if pubtime:
            # only store/validate pdate when a publish date was detected
            pubtimeint = int(time.mktime(pubtime.timetuple()))
            item['pdate'] = ValidateTime( pubtimeint )
        item['content'] = first_article.text
        item['showcontent'] = item['content'].replace("\n", "<br/>")
        return item

    except Exception, e:
        print e
        return item
Code Example #3
File: test.py  Project: profmugshot/group17
def test():
	html = urllib2.urlopen('http://www.cs.sfu.ca/people/faculty.html')
	soup = BeautifulSoup(html)
	text = soup.findAll(text=True)
	page = filter(visible, text)
	page = [token.strip(' ').lower() for token in page]
	print rel.check(page)

	a = doc('111','hello','<had')
	print a.getHTML()
Code Example #4
File: manual_crawler.py  Project: profmugshot/group17
def crawl():
	profStr = ''' 
http://www.cs.sfu.ca/people/faculty/gregbaker.html http://www.cs.sfu.ca/~ggbaker/ http://www.cs.sfu.ca/people/faculty/bradleybart.html http://www.cs.sfu.ca/~bbart/ http://www.cs.sfu.ca/people/faculty/petraberenbrink.html http://www.cs.sfu.ca/~petra/ http://www.cs.sfu.ca/people/faculty/binaybhattacharya.html http://www.cs.sfu.ca/~binay/ http://www.cs.sfu.ca/people/faculty/andreibulatov.html http://www.cs.sfu.ca/~abulatov/ http://www.cs.sfu.ca/people/faculty/robertdcameron.html http://www.cs.sfu.ca/~cameron/ http://www.cs.sfu.ca/people/faculty/dianacukierman.html http://www.cs.sfu.ca/people/faculty/ryandarcy.html http://www.cs.sfu.ca/%7Everonica/ http://www.cs.sfu.ca/people/faculty/veronicadahl1.html http://www.cs.sfu.ca/~veronica/ http://www.cs.sfu.ca/people/faculty/jamesdelgrande.html http://www.cs.sfu.ca/~jim/ http://www.cs.sfu.ca/people/faculty/tonydixon.html http://www.cs.sfu.ca/people/faculty/tobydonaldson.html http://www.cs.sfu.ca/people/faculty/markdrew.html http://www.cs.sfu.ca/~mark/ http://www.cs.sfu.ca/people/faculty/johnedgar.html http://www.cs.sfu.ca/people/faculty/fundaergun.html http://www.cs.sfu.ca/~funda/ http://www.cs.sfu.ca/people/faculty/martinester.html http://www.cs.sfu.ca/~ester/ http://www.cs.sfu.ca/people/faculty/mikeevans.html http://www.cs.sfu.ca/people/faculty/alexandrafedorova.html http://www.cs.sfu.ca/~fedorova/ http://www.cs.sfu.ca/people/faculty/brianfraser.html http://www.cs.sfu.ca/~bfraser/ http://www.cs.sfu.ca/people/faculty/brianfunt.html http://www.cs.sfu.ca/~funt/ http://www.cs.sfu.ca/people/faculty/uweglasser.html http://www.cs.sfu.ca/~glaesser/ http://www.cs.sfu.ca/people/faculty/QianpingGu.html http://www.cs.sfu.ca/~qgu/ http://www.cs.sfu.ca/people/faculty/louhafer.html http://www.cs.sfu.ca/~lou/ http://www.cs.sfu.ca/people/faculty/ghassanhamarneh.html http://www.cs.sfu.ca/~hamarneh/ http://www.cs.sfu.ca/people/faculty/mohamedhefeeda.html http://www.cs.sfu.ca/~mhefeeda/ http://www.cs.sfu.ca/people/faculty/pavolhell.html http://www.cs.sfu.ca/~pavol/ http://www.cs.sfu.ca/people/faculty/valentinekabanets.html http://www.cs.sfu.ca/~kabanets/ http://www.cs.sfu.ca/people/faculty/harindersinghkhangura.html http://www.cs.sfu.ca/~hskhangu/ http://www.cs.sfu.ca/people/faculty/arthurkirkpatrick.html http://www.cs.sfu.ca/~ted/ http://www.cs.sfu.ca/people/faculty/RameshKrishnamurti.html http://www.cs.sfu.ca/~ramesh/ http://www.cs.sfu.ca/people/faculty/annelavergne.html http://www.cs.sfu.ca/~alavergn/ http://www.cs.sfu.ca/people/faculty/ze-lianli.html http://www.cs.sfu.ca/~li/ http://www.cs.sfu.ca/people/faculty/arthurliestman.html http://www.cs.sfu.ca/~art/ http://www.cs.sfu.ca/people/faculty/jiangchuanliu.html http://www.cs.sfu.ca/~jcliu/ http://www.cs.sfu.ca/people/faculty/Wo-ShunLuk.html http://www.cs.sfu.ca/~woshun/ http://www.cs.sfu.ca/people/faculty/davidmitchell.html http://www.cs.sfu.ca/~mitchell/ http://www.cs.sfu.ca/people/faculty/gregmori.html http://www.cs.sfu.ca/~mori/ http://www.cs.sfu.ca/people/faculty/torstenmoller.html http://www.cs.sfu.ca/~torsten/ http://www.cs.sfu.ca/people/faculty/stevenpearce.html http://www.cs.sfu.ca/~stevenp/ http://www.cs.sfu.ca/people/faculty/jianpei.html http://www.cs.sfu.ca/~jpei/ http://www.cs.sfu.ca/people/faculty/JosephPeters.html http://www.cs.sfu.ca/~peters/ http://www.cs.sfu.ca/people/faculty/fredpopowich.html http://www.cs.sfu.ca/~popowich/ http://www.cs.sfu.ca/people/faculty/janiceregan.html http://www.cs.sfu.ca/~jregan/ http://www.cs.sfu.ca/people/faculty/cenkssahinalp.html http://www.cs.sfu.ca/~cenk/ 
http://www.cs.sfu.ca/people/faculty/anoopsarkar.html http://www.cs.sfu.ca/~anoop/ http://www.cs.sfu.ca/people/faculty/oliverschulte.html http://www.cs.sfu.ca/~oschulte/ http://www.cs.sfu.ca/people/faculty/thomasshermer.html http://www.cs.sfu.ca/~shermer/ http://www.cs.sfu.ca/people/Faculty/Profile/ashriram.html http://www.cs.sfu.ca/people/faculty/ArrvindhShriraman.html http://www.cs.sfu.ca/~ashriram/ http://www.cs.sfu.ca/people/Faculty/Profile/tamaras.html http://www.cs.sfu.ca/people/faculty/tamarasmyth.html http://www.cs.sfu.ca/~tamaras/ http://www.cs.sfu.ca/people/Faculty/Profile/tardos.html http://www.cs.sfu.ca/people/faculty/gabortardos.html http://www.cs.sfu.ca/~tardos/ http://www.cs.sfu.ca/people/Faculty/Profile/ter.html http://www.cs.sfu.ca/people/faculty/EugeniaTernovska.html http://www.cs.sfu.ca/~ter/ http://www.cs.sfu.ca/people/Faculty/Profile/vaughan.html http://www.cs.sfu.ca/people/faculty/richardvaughan.html http://www.cs.sfu.ca/~vaughan/ http://www.cs.sfu.ca/people/Faculty/Profile/wangk.html http://www.cs.sfu.ca/people/faculty/kewang.html http://www.cs.sfu.ca/~wangk/ http://www.cs.sfu.ca/people/faculty/kaywiese.html http://www.cs.sfu.ca/~wiese/ http://www.cs.sfu.ca/people/faculty/cynthiaxie.html http://www.cs.sfu.ca/people/Faculty/Profile/haoz.html http://www.cs.sfu.ca/people/faculty/richardzhang.html http://www.cs.sfu.ca/~haoz/'''
	seedURL = profStr.split(" ") 
	#seedURL = ['http://www.cs.sfu.ca/people/faculty.html','http://www.cs.sfu.ca/CourseCentral']
	parsedURL = []
	while len(seedURL)!=0:
##		if len(parsedURL)>20:
##			break;
		print '###### start iterate ######'
		url = seedURL.pop(0) #gets first element
		parsedURL.append(url)
		print 'Reading URL %s' % url
		print 'Parsing seed URL'
		try:
			html = urllib2.urlopen(url)
		except:
			print 'can\'t open url'
			continue

		soup = BeautifulSoup(html)
		try:
		#	extractedLinks = extractInternalLinks(url, soup)
			pageTitle = clean(soup.title.string)
		except:
			print 'cannot clean page title, likely a PDF, skipping doc'
			continue

		docID = url
		docObj = doc(docID, pageTitle, html)

		try:
			sql = 'insert into docs (docID, title, html) values (%s, %s, %s);'
			cur.execute(sql, (docID, pageTitle, soup) )
		except:
			print 'duplicate entries'
		try:
			db.commit()
		except:
			print 'duplicates. check sql'
		parsedSet= Set(parsedURL)
		seedSet = Set(seedURL)
		#tempSet = Set(extractedLinks)

		#get all urls in tempSet that is not in parsedSet (tempSet - parsedSet)
		#toBeAddedURL = tempSet.difference(parsedSet)

		#seedSet = seedSet.union(toBeAddedURL)
		#seedURL = list(seedSet) #manually crawl seed URLs and end
		print 'length of seed URL ', len(seedURL)
		#print 'length of parsedURL ', len(list(tempSet))

		print '#####ending iteration####'
Code Example #5
def modmain():
    t0 = time.ticks()
    try:
        import doc
        blob = doc.doc()  # get the (key, lang) tuple
    except:
        blob = ('welcome', 'chs')
    key = blob[0]
    lang = blob[1]
    print('='*20,'\033[0;45m\033[4m', end='') # magenta background, underline
    print(key, end='')
    print('\033[0m','='*20)  # no color
    hash = compute_hash(key, 100)

    fd = open_helpfile(key)
    s = fd.read(4096)

    while len(s) > 1:
        subLen = len(s)
        ret = doc.search(s, subLen)
        if ret == 1:
            break # found
        elif ret == 2: # link
            blob = doc.doc() # get new key
            key = blob[0]
            fd.close()
            fd = open_helpfile(key)
            print('-->', key)
            fd.seek(0)
        elif ret < 0:            
            fd.seek(ret,1)
        s = fd.read(4096)
        if len(s) == -ret:
            break

    t1 = time.ticks()
    #print('ticks = %d' % (t1 - t0))
    fd.close()
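
open_helpfile() and compute_hash() are project helpers that this snippet does not include. A hypothetical stand-in for open_helpfile(), only to make the example self-contained; the companion doc_load_slow.py example below reads a single /aia_doc/doc_file.txt, so the real helper may resolve keys quite differently:

def open_helpfile(key):
    # Hypothetical: assume one plain-text help file per key under /aia_doc/.
    return open('/aia_doc/%s.txt' % key, 'r')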
Code Example #6
File: crawler.py  Project: profmugshot/group17
def crawl():
	seedURL = ['http://www.cs.sfu.ca/people/faculty.html','http://www.cs.sfu.ca/CourseCentral']
	parsedURL = []
	while len(seedURL)!=0:
##		if len(parsedURL)>20:
##			break;
		print '###### start iterate ######'
		url = seedURL.pop(0) #gets first element
		parsedURL.append(url)
		print 'Reading URL %s' % url
		print 'Parsing seed URL'
		try:
			html = urllib2.urlopen(url)
		except:
			print 'can\'t open url'
			continue

		soup = BeautifulSoup(html)
		try:
			extractedLinks = extractInternalLinks(url, soup)
			pageTitle = clean(soup.title.string)
		except:
			print 'cannot clean page title, likely a PDF, skipping doc'
			continue

		docID = url
		docObj = doc(docID, pageTitle, html)

		try:
			sql = 'insert into docs (docID, title, html) values (%s, %s, %s);'
			cur.execute(sql, (docID, pageTitle, soup) )
		except:
			print 'duplicate entries'
		try:
			db.commit()
		except:
			print 'duplicates. check sql'
		parsedSet= Set(parsedURL)
		seedSet = Set(seedURL)
		tempSet = Set(extractedLinks)

		#get all urls in tempSet that is not in parsedSet (tempSet - parsedSet)
		toBeAddedURL = tempSet.difference(parsedSet)

		seedSet = seedSet.union(toBeAddedURL)
		seedURL = list(seedSet)
		print 'length of seed URL ', len(seedURL)
		print 'length of parsedURL ', len(list(tempSet))

		print '#####ending iteration####'
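
The insert statement above assumes a pre-existing docs table and module-level db/cur handles that the snippet does not show. A hedged sketch of that setup; the %s placeholders suggest a MySQLdb-style (paramstyle 'format') connection, and the connection details and column types here are assumptions:

import MySQLdb  # assumed driver, based on the %s paramstyle used above

db = MySQLdb.connect(host='localhost', user='crawler', passwd='secret', db='crawler_db')
cur = db.cursor()
cur.execute('''create table if not exists docs (
    docID varchar(255) primary key,
    title text,
    html  mediumtext
)''')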
Code Example #7
File: parserlogic.py  Project: zymtech/parse_newspage
def ArticleDateParser( paras ):
    html = paras['html']
    url = paras['url']
    parserTable = {}
    item = {}
    try:
        item['parser'] = 'ArticleDate'
        docrsp = doc(html,url)
        pubdate = articleDateExtractor.extractArticlePublishedDate(url, docrsp.html())
        if pubdate:
            item['pdate'] = int(time.mktime(pubdate.timetuple()))
            item['pdate'] = ValidateTime( item['pdate'] )
        return item
    except Exception, e:
        print e
        return item
Code Example #8
def ArticleDateParser(paras):
    html = paras['html']
    url = paras['url']
    parserTable = {}
    item = {}
    try:
        item['parser'] = 'ArticleDate'
        docrsp = doc(html, url)
        pubdate = articleDateExtractor.extractArticlePublishedDate(
            url, docrsp.html())
        if pubdate:
            item['pdate'] = int(time.mktime(pubdate.timetuple()))
            item['pdate'] = ValidateTime(item['pdate'])
        return item
    except Exception, e:
        print e
        return item
Code Example #9
def CSSParser(paras):
    html = paras['html']
    url = paras['url']
    item = {}
    item['parser'] = 'CSS'
    try:
        url_ext = tldextract.extract(url).domain
        parser = getsafedictvalue(parserTable, url_ext + "/parser", None)
        cnname = getsafedictvalue(parserTable, url_ext + "/name", "")
        if parser is None:
            return item

        linkurl = url
        docrsp = doc(html, url)
        pubtimeint = 0
        pubtimetxt = ''
        for CSS in parser:
            contraw = docrsp(
                CSS["content"]).remove("a").remove("script").remove("style")
            if contraw == None:
                continue
            item['content'] = contraw.text()
            item['title'] = docrsp(CSS["title"]).text()
            pubtimetxt = docrsp(CSS["date"]).text()
            item['pdate'] = parsedate(pubtimetxt)
            if (len(item['content']) > 0) and (len(item['title']) > 0) and (item['pdate'] > 0):
                break

        if contraw:
            cleaner = clean.Cleaner(page_structure=True)
            showcont = cleaner.clean_html(
                contraw.remove_attr('id').remove_attr('class').wrapAll(
                    '<div></div>').html())
            showcont = re.sub(r'id=".*?"|class=".*?"', '', showcont)
            showcont = re.sub(r'[\s+]*?>', '>', showcont)
            showcont = showcont.replace("\n", "").replace("\t", "").replace(
                "<div>", "").replace("</div>", "")
            item['showcontent'] = showcont
            item['pdate'] = ValidateTime(item['pdate'])

        return item
    except Exception, e:
        print e
        return item
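
CSSParser() relies on a module-level parserTable and a getsafedictvalue() helper, neither of which appears in the snippet. From the lookups above, each site seems to map its registered domain to a display name and a list of CSS selector sets; the key layout and selectors below are illustrative assumptions (getsafedictvalue() may instead treat the '/' as a nested path):

parserTable = {
    # hypothetical entry for pages whose registered domain is 'example'
    'example/name': u'Example News',
    'example/parser': [
        {'content': 'div.article-body', 'title': 'h1.headline', 'date': 'span.pub-time'},
        {'content': '#content', 'title': 'title', 'date': '.date'},  # fallback selector set
    ],
}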
Code Example #10
File: parserlogic.py  Project: zymtech/parse_newspage
def CXParser( paras ):
    html = paras['html']
    url = paras['url']
    parserTable = {}
    item = {}
    try:
        item['parser'] = 'CX'
        docrsp = doc(html,url)
        cx = cx_extractor()
        test_html = docrsp.html() #cx.getHtml(response.url)
        s = cx.filter_tags(test_html)
        item['content'] = cx.getText(s)
        item['showcontent'] = item['content'].replace("\n", "<br/>")
        item['pdate'] = None
        return item
    except Exception, e:
        print e
        return item
Code Example #11
def CXParser(paras):
    html = paras['html']
    url = paras['url']
    parserTable = {}
    item = {}
    try:
        item['parser'] = 'CX'
        docrsp = doc(html, url)
        cx = cx_extractor()
        test_html = docrsp.html()  #cx.getHtml(response.url)
        s = cx.filter_tags(test_html)
        item['content'] = cx.getText(s)
        item['showcontent'] = item['content'].replace("\n", "<br/>")
        item['pdate'] = None
        return item
    except Exception, e:
        print e
        return item
Code Example #12
File: parserlogic.py  Project: zymtech/parse_newspage
def CSSParser( paras ):
    html = paras['html']
    url = paras['url']
    item = {}
    item['parser'] = 'CSS'
    try: 
        url_ext = tldextract.extract(url).domain
        parser = getsafedictvalue(parserTable, url_ext+"/parser", None)
        cnname = getsafedictvalue(parserTable, url_ext + "/name", "")
        if parser is None:
            return item

        linkurl = url
        docrsp = doc(html,url)
        pubtimeint = 0
        pubtimetxt=''
        for CSS in parser:
            contraw = docrsp(CSS["content"]).remove("a").remove("script").remove("style")
            if contraw == None:
                continue
            item['content'] = contraw.text()
            item['title'] = docrsp(CSS["title"]).text()
            pubtimetxt = docrsp(CSS["date"]).text()
            item['pdate'] = parsedate(pubtimetxt)
            if (len( item['content']) > 0) and (len(item['title']) > 0) and (item['pdate'] > 0):
                break
                
        if contraw:
            cleaner = clean.Cleaner(page_structure=True)
            showcont = cleaner.clean_html(contraw.remove_attr('id').remove_attr('class').wrapAll('<div></div>').html())
            showcont = re.sub(r'id=".*?"|class=".*?"', '', showcont)
            showcont = re.sub(r'[\s+]*?>', '>', showcont)
            showcont = showcont.replace("\n", "").replace("\t", "").replace("<div>", "").replace("</div>", "")
            item['showcontent'] = showcont
            item['pdate'] = ValidateTime( item['pdate'] )

        return item
    except Exception, e:
        print e
        return item
Code Example #13
    def printList(self, list):
        for doc in list:
            print(doc.getName())

    def printCatalog(self):
        for keyword in self.catalog:
            print(keyword," ",self.catalog[keyword][0])
            for k in self.catalog[keyword][1]:
                print("      [",k[0],",",k[1].getName(),"]")



if __name__ == "__main__":
    myInvertedIndex = invIndx()
    for name in files:
        myInvertedIndex.numberOfDocuments += 1
        newDoc = doc(name)
        with open(name) as file:
            for line in file:
                line = myInvertedIndex.removePunctuation(line)
                fields = line.split(" ")
                for keyword in fields:
                    kwd = keyword.split("\n")
                    kwd = kwd[0].lower()
                    newDoc.add(kwd)
                    myInvertedIndex.add(kwd,newDoc)                 ## TO-DO SEARCH FOR "Lorem"
    #print(myInvertedIndex.printCatalog())
    answer = myInvertedIndex.search("Lorem",4)
    for i in range(0,len(answer)):
        print("{})  {}    {}".format(i+1,answer[i][0].getName(),answer[i][1]))
Code Example #14
import doc

docconsole = doc.doc(username="******", plugin=False, live_debug=True)
docconsole.terminalclient()
Code Example #15
File: doc_load_slow.py  Project: mfkiwl/CMM_OpenMV
def modmain():
    state = 0
    t0 = time.ticks()
    fd = open('/aia_doc/doc_file.txt', 'r')
    s = fd.read(8192)
    lnNum = 0
    langEndStr = ""
    sHelp = ''
    while len(s) > 1:
        subLen = len(s)
        t = 0  # t is the index of current string segment
        try:
            import doc
            blob = doc.doc()  # get the (key, lang) tuple
        except:
            blob = ('welcome', 'chs')
        key = blob[0]
        lang = blob[1]
        # search in a segment of data
        while True:
            t2 = s.find('\n', t)
            if t2 < 0:
                rem = subLen - t
                if rem > 0:
                    fd.seek(-(subLen - t), 1)
                break
            lineLen = t2 - t
            #-------------------------------------
            if state != 0:
                sLine = s[t:t2 - 1]
            lnNum += 1
            if lnNum % 1000 == 0:
                print('.', end='')
            if state == 0 and s[t:t + 5] == '<key>' and s[t + 5:t2 - 1] == key:  # searching key
                state = 2  # found
            elif state == 2:  # search lang
                if sLine[:6] == '<lang=' and sLine[-1] == '>':
                    foundLang = sLine[6:-1]
                    state = 3
                    if foundLang == lang:
                        if lnNum >= 1000:
                            print('\r')
                        state = 4
                elif sLine[:6] == '<link=' and sLine[-1] == '>':
                    key = sLine[6:-1]
                    print('-->', key)
                    state = 0
                    #restart from doc
                    fd.seek(0)
                    break
            elif state == 3:
                if sLine == '</lang>':
                    state = 2
            elif state == 4:
                if sLine == '</lang>':
                    doc.get(sHelp)
                    break
                print(sLine)
                sHelp = sHelp + sLine + '\n'
            #-------------------------------------
            t += lineLen + 1
        if len(sHelp) > 0:
            break
        s = fd.read(8192)
    t1 = time.ticks()
    print('ticks = %d' % (t1 - t0))
    fd.close()
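
The tag comparisons in modmain() above hint at the layout of /aia_doc/doc_file.txt: a <key>NAME line opens an entry, <lang=...> ... </lang> blocks hold the text for each language, and a <link=OTHERKEY> line redirects the lookup to another entry. A hedged sketch of one entry; the actual file contents are an assumption, and the code's habit of dropping the last character of every line suggests CRLF line endings:

sample_entry = (
    '<key>welcome\r\n'
    '<lang=chs>\r\n'
    '...Chinese help text...\r\n'
    '</lang>\r\n'
    '<lang=eng>\r\n'
    '...English help text...\r\n'
    '</lang>\r\n'
)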
Code Example #16
        depth = int(args[3])
        file_name = args[4]

        db = Db(sqlite3.connect(name + '.db'), Sql())
        db.setup(depth)

        txt = codecs.open(file_name, 'r', 'utf-8').read()
        Parser(name, db, SENTENCE_SEPARATOR, WORD_SEPARATOR).parse(txt)

    elif mode == 'gen':
        count = int(args[3])
        db = Db(sqlite3.connect(name + '.db'), Sql())
        generator = Generator(name, db, Rnd())

        ##for i in range(0, count):
        ##	print(generator.generate(WORD_SEPARATOR))
        config = configparser.ConfigParser()
        config.read("document.config")
        file_size = config.get("document", "filesize").split(",")
        file_name = config.get("document", "filename").split(",")
        file_format = config.get("document", "format")

        if file_format == "txt":
            temptxt = txt(file_size, file_name, generator, WORD_SEPARATOR)
            temptxt.execute()
        elif file_format == "doc":
            doc = doc(file_size, file_name, generator, WORD_SEPARATOR)
            doc.execute()

    else:
        raise ValueError(usage)
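
The 'gen' branch above reads its settings from a document.config file via configparser; the section and option names come from the config.get() calls, while the values below are placeholders. A minimal sketch that writes such a file so the example can be reproduced:

import configparser

config = configparser.ConfigParser()
config['document'] = {
    'filesize': '1000,5000',        # comma-separated sizes; split on ',' above
    'filename': 'sample1,sample2',  # comma-separated output names
    'format': 'txt',                # 'txt' or 'doc', matching the two branches
}
with open('document.config', 'w') as fh:
    config.write(fh)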