def NewspaperParser(paras):
    # Extract title, body text and publish date with the newspaper library.
    # `doc` (an html/url wrapper) and `ValidateTime` are project helpers
    # assumed to be in scope.
    html = paras['html']
    url = paras['url']
    item = {}
    item['parser'] = 'Newspaper'
    item['pdate'] = 0  # default, so ValidateTime below cannot hit a KeyError
    try:
        docrsp = doc(html, url)
        config = Config()
        config.fetch_images = False
        first_article = Article(url=url, config=config)
        first_article.download(docrsp.html())
        first_article.parse()
        item['title'] = first_article.title
        pubtime = first_article.publish_date
        if pubtime:
            item['pdate'] = int(time.mktime(pubtime.timetuple()))
        item['content'] = first_article.text
        item['pdate'] = ValidateTime(item['pdate'])
        item['showcontent'] = item['content'].replace("\n", "<br/>")
        return item
    except Exception as e:
        print e
        return item
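# A minimal invocation sketch for the parsers in this file, assuming the
# helpers they rely on (doc, ValidateTime, ...) are in scope; the URL is a
# placeholder, not from the original code:
import urllib2

url = 'http://example.com/some-article.html'
html = urllib2.urlopen(url).read()
item = NewspaperParser({'html': html, 'url': url})
print item.get('title'), item.get('pdate')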
import urllib2
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3; for bs4 use `from bs4 import BeautifulSoup`


def test():
    # `visible` and `rel` are project helpers; a sketch of `visible` follows
    # below.
    html = urllib2.urlopen('http://www.cs.sfu.ca/people/faculty.html')
    soup = BeautifulSoup(html)
    text = soup.findAll(text=True)
    page = filter(visible, text)
    page = [token.strip(' ').lower() for token in page]
    print rel.check(page)

    a = doc('111', 'hello', '<had')
    print a.getHTML()
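# A minimal sketch of the `visible` filter used above (an assumption, not the
# original helper): keep only text nodes a browser would actually render.
import re

def visible(element):
    # Drop text living inside non-rendered tags.
    if element.parent.name in ('style', 'script', 'head', 'title', '[document]'):
        return False
    # Drop HTML comments.
    if re.match('<!--.*-->', unicode(element)):
        return False
    return True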
from sets import Set  # Python 2's sets module; the built-in set would also work


def crawl():
    # Variant of crawl() seeded with a fixed list of faculty URLs; link
    # extraction is disabled (commented out) so only the seeds are fetched.
    profStr = '''
http://www.cs.sfu.ca/people/faculty/gregbaker.html
http://www.cs.sfu.ca/~ggbaker/
http://www.cs.sfu.ca/people/faculty/bradleybart.html
http://www.cs.sfu.ca/~bbart/
http://www.cs.sfu.ca/people/faculty/petraberenbrink.html
http://www.cs.sfu.ca/~petra/
http://www.cs.sfu.ca/people/faculty/binaybhattacharya.html
http://www.cs.sfu.ca/~binay/
http://www.cs.sfu.ca/people/faculty/andreibulatov.html
http://www.cs.sfu.ca/~abulatov/
http://www.cs.sfu.ca/people/faculty/robertdcameron.html
http://www.cs.sfu.ca/~cameron/
http://www.cs.sfu.ca/people/faculty/dianacukierman.html
http://www.cs.sfu.ca/people/faculty/ryandarcy.html
http://www.cs.sfu.ca/%7Everonica/
http://www.cs.sfu.ca/people/faculty/veronicadahl1.html
http://www.cs.sfu.ca/~veronica/
http://www.cs.sfu.ca/people/faculty/jamesdelgrande.html
http://www.cs.sfu.ca/~jim/
http://www.cs.sfu.ca/people/faculty/tonydixon.html
http://www.cs.sfu.ca/people/faculty/tobydonaldson.html
http://www.cs.sfu.ca/people/faculty/markdrew.html
http://www.cs.sfu.ca/~mark/
http://www.cs.sfu.ca/people/faculty/johnedgar.html
http://www.cs.sfu.ca/people/faculty/fundaergun.html
http://www.cs.sfu.ca/~funda/
http://www.cs.sfu.ca/people/faculty/martinester.html
http://www.cs.sfu.ca/~ester/
http://www.cs.sfu.ca/people/faculty/mikeevans.html
http://www.cs.sfu.ca/people/faculty/alexandrafedorova.html
http://www.cs.sfu.ca/~fedorova/
http://www.cs.sfu.ca/people/faculty/brianfraser.html
http://www.cs.sfu.ca/~bfraser/
http://www.cs.sfu.ca/people/faculty/brianfunt.html
http://www.cs.sfu.ca/~funt/
http://www.cs.sfu.ca/people/faculty/uweglasser.html
http://www.cs.sfu.ca/~glaesser/
http://www.cs.sfu.ca/people/faculty/QianpingGu.html
http://www.cs.sfu.ca/~qgu/
http://www.cs.sfu.ca/people/faculty/louhafer.html
http://www.cs.sfu.ca/~lou/
http://www.cs.sfu.ca/people/faculty/ghassanhamarneh.html
http://www.cs.sfu.ca/~hamarneh/
http://www.cs.sfu.ca/people/faculty/mohamedhefeeda.html
http://www.cs.sfu.ca/~mhefeeda/
http://www.cs.sfu.ca/people/faculty/pavolhell.html
http://www.cs.sfu.ca/~pavol/
http://www.cs.sfu.ca/people/faculty/valentinekabanets.html
http://www.cs.sfu.ca/~kabanets/
http://www.cs.sfu.ca/people/faculty/harindersinghkhangura.html
http://www.cs.sfu.ca/~hskhangu/
http://www.cs.sfu.ca/people/faculty/arthurkirkpatrick.html
http://www.cs.sfu.ca/~ted/
http://www.cs.sfu.ca/people/faculty/RameshKrishnamurti.html
http://www.cs.sfu.ca/~ramesh/
http://www.cs.sfu.ca/people/faculty/annelavergne.html
http://www.cs.sfu.ca/~alavergn/
http://www.cs.sfu.ca/people/faculty/ze-lianli.html
http://www.cs.sfu.ca/~li/
http://www.cs.sfu.ca/people/faculty/arthurliestman.html
http://www.cs.sfu.ca/~art/
http://www.cs.sfu.ca/people/faculty/jiangchuanliu.html
http://www.cs.sfu.ca/~jcliu/
http://www.cs.sfu.ca/people/faculty/Wo-ShunLuk.html
http://www.cs.sfu.ca/~woshun/
http://www.cs.sfu.ca/people/faculty/davidmitchell.html
http://www.cs.sfu.ca/~mitchell/
http://www.cs.sfu.ca/people/faculty/gregmori.html
http://www.cs.sfu.ca/~mori/
http://www.cs.sfu.ca/people/faculty/torstenmoller.html
http://www.cs.sfu.ca/~torsten/
http://www.cs.sfu.ca/people/faculty/stevenpearce.html
http://www.cs.sfu.ca/~stevenp/
http://www.cs.sfu.ca/people/faculty/jianpei.html
http://www.cs.sfu.ca/~jpei/
http://www.cs.sfu.ca/people/faculty/JosephPeters.html
http://www.cs.sfu.ca/~peters/
http://www.cs.sfu.ca/people/faculty/fredpopowich.html
http://www.cs.sfu.ca/~popowich/
http://www.cs.sfu.ca/people/faculty/janiceregan.html
http://www.cs.sfu.ca/~jregan/
http://www.cs.sfu.ca/people/faculty/cenkssahinalp.html
http://www.cs.sfu.ca/~cenk/
http://www.cs.sfu.ca/people/faculty/anoopsarkar.html
http://www.cs.sfu.ca/~anoop/
http://www.cs.sfu.ca/people/faculty/oliverschulte.html
http://www.cs.sfu.ca/~oschulte/
http://www.cs.sfu.ca/people/faculty/thomasshermer.html
http://www.cs.sfu.ca/~shermer/
http://www.cs.sfu.ca/people/Faculty/Profile/ashriram.html
http://www.cs.sfu.ca/people/faculty/ArrvindhShriraman.html
http://www.cs.sfu.ca/~ashriram/
http://www.cs.sfu.ca/people/Faculty/Profile/tamaras.html
http://www.cs.sfu.ca/people/faculty/tamarasmyth.html
http://www.cs.sfu.ca/~tamaras/
http://www.cs.sfu.ca/people/Faculty/Profile/tardos.html
http://www.cs.sfu.ca/people/faculty/gabortardos.html
http://www.cs.sfu.ca/~tardos/
http://www.cs.sfu.ca/people/Faculty/Profile/ter.html
http://www.cs.sfu.ca/people/faculty/EugeniaTernovska.html
http://www.cs.sfu.ca/~ter/
http://www.cs.sfu.ca/people/Faculty/Profile/vaughan.html
http://www.cs.sfu.ca/people/faculty/richardvaughan.html
http://www.cs.sfu.ca/~vaughan/
http://www.cs.sfu.ca/people/Faculty/Profile/wangk.html
http://www.cs.sfu.ca/people/faculty/kewang.html
http://www.cs.sfu.ca/~wangk/
http://www.cs.sfu.ca/people/faculty/kaywiese.html
http://www.cs.sfu.ca/~wiese/
http://www.cs.sfu.ca/people/faculty/cynthiaxie.html
http://www.cs.sfu.ca/people/Faculty/Profile/haoz.html
http://www.cs.sfu.ca/people/faculty/richardzhang.html
http://www.cs.sfu.ca/~haoz/'''
    seedURL = profStr.split()  # split on any whitespace: the URLs sit one per line
    #seedURL = ['http://www.cs.sfu.ca/people/faculty.html','http://www.cs.sfu.ca/CourseCentral']
    parsedURL = []
    while len(seedURL) != 0:
        ## if len(parsedURL) > 20:
        ##     break
        print '###### start iterate ######'
        url = seedURL.pop(0)  # gets first element
        parsedURL.append(url)
        print 'Reading URL %s' % url
        print 'Parsing seed URL'
        try:
            html = urllib2.urlopen(url)
        except:
            print 'can\'t open url'
            continue
        soup = BeautifulSoup(html)
        try:
            # extractedLinks = extractInternalLinks(url, soup)
            pageTitle = clean(soup.title.string)
        except:
            print 'cannot clean page title, likely a PDF, skipping doc'
            continue
        docID = url
        docObj = doc(docID, pageTitle, html)
        try:
            sql = 'insert into docs (docID, title, html) values (%s, %s, %s);'
            cur.execute(sql, (docID, pageTitle, soup))
        except:
            print 'duplicate entries'
        try:
            db.commit()
        except:
            print 'duplicates. check sql'
        parsedSet = Set(parsedURL)
        seedSet = Set(seedURL)
        #tempSet = Set(extractedLinks)
        # get all urls in tempSet that are not in parsedSet (tempSet - parsedSet)
        #toBeAddedURL = tempSet.difference(parsedSet)
        #seedSet = seedSet.union(toBeAddedURL)
        #seedURL = list(seedSet)
        # manually crawl seed URLs and end
        print 'length of seed URL ', len(seedURL)
        #print 'length of parsedURL ', len(list(tempSet))
        print '#####ending iteration####'
def modmain():
    t0 = time.ticks()
    try:
        import doc
        blob = doc.doc()  # get the (key, lang) tuple
    except:
        blob = ('welcome', 'chs')
    key = blob[0]
    lang = blob[1]
    print('=' * 20, '\033[0;45m\033[4m', end='')  # magenta background, underlined
    print(key, end='')
    print('\033[0m', '=' * 20)  # reset color
    hash = compute_hash(key, 100)
    fd = open_helpfile(key)
    s = fd.read(4096)
    while len(s) > 1:
        subLen = len(s)
        ret = doc.search(s, subLen)
        if ret == 1:
            break  # found
        elif ret == 2:  # link: reopen the help file for the new key
            blob = doc.doc()  # get new key
            key = blob[0]
            fd.close()
            fd = open_helpfile(key)
            print('-->', key)
            fd.seek(0)
        elif ret < 0:
            fd.seek(ret, 1)  # rewind by the unconsumed tail
        s = fd.read(4096)
        if len(s) == -ret:
            break
    t1 = time.ticks()
    #print('ticks = %d' % (t1 - t0))
    fd.close()
def crawl():
    # BFS crawl: fetch each seed, store it, then push newly extracted internal
    # links that have not been parsed yet. Uses the urllib2 / BeautifulSoup /
    # Set imports above; `extractInternalLinks`, `clean`, `doc`, `cur` and
    # `db` are project helpers.
    seedURL = ['http://www.cs.sfu.ca/people/faculty.html', 'http://www.cs.sfu.ca/CourseCentral']
    parsedURL = []
    while len(seedURL) != 0:
        ## if len(parsedURL) > 20:
        ##     break
        print '###### start iterate ######'
        url = seedURL.pop(0)  # gets first element
        parsedURL.append(url)
        print 'Reading URL %s' % url
        print 'Parsing seed URL'
        try:
            html = urllib2.urlopen(url)
        except:
            print 'can\'t open url'
            continue
        soup = BeautifulSoup(html)
        try:
            extractedLinks = extractInternalLinks(url, soup)
            pageTitle = clean(soup.title.string)
        except:
            print 'cannot clean page title, likely a PDF, skipping doc'
            continue
        docID = url
        docObj = doc(docID, pageTitle, html)
        try:
            sql = 'insert into docs (docID, title, html) values (%s, %s, %s);'
            cur.execute(sql, (docID, pageTitle, soup))
        except:
            print 'duplicate entries'
        try:
            db.commit()
        except:
            print 'duplicates. check sql'
        parsedSet = Set(parsedURL)
        seedSet = Set(seedURL)
        tempSet = Set(extractedLinks)
        # get all urls in tempSet that are not in parsedSet (tempSet - parsedSet)
        toBeAddedURL = tempSet.difference(parsedSet)
        seedSet = seedSet.union(toBeAddedURL)
        seedURL = list(seedSet)
        print 'length of seed URL ', len(seedURL)
        print 'length of parsedURL ', len(list(tempSet))
        print '#####ending iteration####'
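# The INSERT above implies a `docs` table keyed by docID, with duplicate
# inserts swallowed by the except clause. A sketch of matching DDL, reusing
# the snippet's cursor; the column types are assumptions, since the original
# schema is not shown:
cur.execute('''
    CREATE TABLE IF NOT EXISTS docs (
        docID VARCHAR(512) PRIMARY KEY,
        title TEXT,
        html  MEDIUMTEXT
    )
''')
db.commit()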
def ArticleDateParser(paras):
    # Date-only parser: delegate to articleDateExtractor and normalize the
    # result through ValidateTime. `item['pdate']` stays unset when no date
    # is found, so callers must check for it.
    html = paras['html']
    url = paras['url']
    item = {}
    try:
        item['parser'] = 'ArticleDate'
        docrsp = doc(html, url)
        pubdate = articleDateExtractor.extractArticlePublishedDate(url, docrsp.html())
        if pubdate:
            item['pdate'] = int(time.mktime(pubdate.timetuple()))
            item['pdate'] = ValidateTime(item['pdate'])
        return item
    except Exception as e:
        print e
        return item
def CSSParser(paras):
    # Site-specific parser: look up per-domain CSS selectors in the global
    # parserTable and try each selector set until content, title and date
    # have all been extracted.
    html = paras['html']
    url = paras['url']
    item = {}
    item['parser'] = 'CSS'
    item['pdate'] = 0  # default, so ValidateTime below cannot hit a KeyError
    try:
        url_ext = tldextract.extract(url).domain
        parser = getsafedictvalue(parserTable, url_ext + "/parser", None)
        cnname = getsafedictvalue(parserTable, url_ext + "/name", "")
        if parser is None:
            return item
        linkurl = url
        docrsp = doc(html, url)
        pubtimetxt = ''
        contraw = None
        for CSS in parser:
            contraw = docrsp(CSS["content"]).remove("a").remove("script").remove("style")
            if contraw is None:
                continue
            item['content'] = contraw.text()
            item['title'] = docrsp(CSS["title"]).text()
            pubtimetxt = docrsp(CSS["date"]).text()
            item['pdate'] = parsedate(pubtimetxt)
            if (len(item['content']) > 0) and (len(item['title']) > 0) and (item['pdate'] > 0):
                break
        if contraw:
            # Strip ids/classes and flatten the markup into a plain fragment.
            cleaner = clean.Cleaner(page_structure=True)
            showcont = cleaner.clean_html(
                contraw.remove_attr('id').remove_attr('class').wrapAll('<div></div>').html())
            showcont = re.sub(r'id=".*?"|class=".*?"', '', showcont)
            showcont = re.sub(r'[\s+]*?>', '>', showcont)
            showcont = showcont.replace("\n", "").replace("\t", "").replace(
                "<div>", "").replace("</div>", "")
            item['showcontent'] = showcont
        item['pdate'] = ValidateTime(item['pdate'])
        return item
    except Exception as e:
        print e
        return item
def CXParser(paras):
    # Generic text extractor (cx_extractor); it recovers body text only, so
    # no publish date is produced.
    html = paras['html']
    url = paras['url']
    item = {}
    try:
        item['parser'] = 'CX'
        docrsp = doc(html, url)
        cx = cx_extractor()
        test_html = docrsp.html()  # cx.getHtml(response.url)
        s = cx.filter_tags(test_html)
        item['content'] = cx.getText(s)
        item['showcontent'] = item['content'].replace("\n", "<br/>")
        item['pdate'] = None
        return item
    except Exception as e:
        print e
        return item
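# The four parsers above share one calling convention: each takes a dict
# {'html': ..., 'url': ...} and returns an item dict, leaving fields unset on
# failure. A plausible way to combine them (an assumption; the original
# dispatch code is not shown) is a fallback chain that stops at the first
# parser yielding usable content:
def parse_page(html, url):
    paras = {'html': html, 'url': url}
    for parser in (CSSParser, NewspaperParser, CXParser):
        item = parser(paras)
        if item.get('content'):
            if not item.get('pdate'):
                # fall back to the date-only extractor
                item['pdate'] = ArticleDateParser(paras).get('pdate')
            return item
    return {'parser': None}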
# Methods excerpted from the invIndx (inverted index) class:
def printList(self, list):
    for doc in list:
        print(doc.getName())

def printCatalog(self):
    for keyword in self.catalog:
        print(keyword, " ", self.catalog[keyword][0])
        for k in self.catalog[keyword][1]:
            print(" [", k[0], ",", k[1].getName(), "]")


if __name__ == "__main__":
    # `files` is the list of input file names, assumed to be defined above.
    myInvertedIndex = invIndx()
    for name in files:
        myInvertedIndex.numberOfDocuments += 1
        newDoc = doc(name)
        with open(name) as file:
            for line in file:
                line = myInvertedIndex.removePunctuation(line)
                fields = line.split(" ")
                for keyword in fields:
                    kwd = keyword.split("\n")
                    kwd = kwd[0].lower()
                    newDoc.add(kwd)
                    myInvertedIndex.add(kwd, newDoc)
    ## TO-DO SEARCH FOR "Lorem"
    #print(myInvertedIndex.printCatalog())
    answer = myInvertedIndex.search("Lorem", 4)
    for i in range(0, len(answer)):
        print("{}) {} {}".format(i + 1, answer[i][0].getName(), answer[i][1]))
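# The `doc` class used above is not shown in this excerpt. A minimal sketch
# consistent with how it is called (doc(name), add(), getName()); the
# attribute names are assumptions, not the original implementation:
class doc:
    def __init__(self, name):
        self.name = name
        self.terms = {}  # keyword -> occurrence count within this document

    def getName(self):
        return self.name

    def add(self, keyword):
        self.terms[keyword] = self.terms.get(keyword, 0) + 1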
import doc

docconsole = doc.doc(username="******", plugin=False, live_debug=True)
docconsole.terminalclient()
def modmain():
    # Line-oriented state machine over /aia_doc/doc_file.txt:
    #   state 0: scan for a '<key>' line matching the requested key
    #   state 2: key found, scan for a matching '<lang=...>' or a '<link=...>' redirect
    #   state 3: inside a non-matching language block, skip until '</lang>'
    #   state 4: inside the matching language block, print and collect lines
    state = 0
    t0 = time.ticks()
    fd = open('/aia_doc/doc_file.txt', 'r')
    s = fd.read(8192)
    lnNum = 0
    langEndStr = ""
    sHelp = ''
    while len(s) > 1:
        subLen = len(s)
        t = 0  # t is the index of current string segment
        try:
            import doc
            blob = doc.doc()  # get the (key, lang) tuple
        except:
            blob = ('welcome', 'chs')
        key = blob[0]
        lang = blob[1]
        # search in a segment of data
        while True:
            t2 = s.find('\n', t)
            if t2 < 0:
                rem = subLen - t
                if rem > 0:
                    fd.seek(-(subLen - t), 1)  # rewind the partial last line
                break
            lineLen = t2 - t
            #-------------------------------------
            if state != 0:
                sLine = s[t:t2 - 1]  # strip the trailing '\r'
            lnNum += 1
            if lnNum % 1000 == 0:
                print('.', end='')
            if state == 0 and s[t:t + 5] == '<key>' and s[t + 5:t2 - 1] == key:
                # searching key
                state = 2  # found
            elif state == 2:
                # search lang
                if sLine[:6] == '<lang=' and sLine[-1] == '>':
                    foundLang = sLine[6:-1]
                    state = 3
                    if foundLang == lang:
                        if lnNum >= 1000:
                            print('\r')
                        state = 4
                elif sLine[:6] == '<link=' and sLine[-1] == '>':
                    key = sLine[6:-1]
                    print('-->', key)
                    state = 0  # restart from doc
                    fd.seek(0)
                    break
            elif state == 3:
                if sLine == '</lang>':
                    state = 2
            elif state == 4:
                if sLine == '</lang>':
                    doc.get(sHelp)
                    break
                print(sLine)
                sHelp = sHelp + sLine + '\n'
            #-------------------------------------
            t += lineLen + 1
        if len(sHelp) > 0:
            break
        s = fd.read(8192)
    t1 = time.ticks()
    print('ticks = %d' % (t1 - t0))
    fd.close()
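# The states above imply a help-file layout roughly like the following
# (reconstructed from the tags the parser matches; the concrete contents are
# an assumption). Lines are '\r\n'-terminated, hence the s[t:t2 - 1] slicing:
SAMPLE_DOC_FILE = (
    '<key>welcome\r\n'
    '<lang=chs>\r\n'
    'help text shown when lang == "chs"\r\n'
    '</lang>\r\n'
    '<lang=eng>\r\n'
    'help text shown when lang == "eng"\r\n'
    '</lang>\r\n'
    '<key>other\r\n'
    '<link=welcome>\r\n'  # redirect: restart the search with key "welcome"
)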
# Fragment of a command-line dispatch on `mode`. The opening branch's test is
# not shown in the original; 'parse' is an assumed mode name consistent with
# its body.
if mode == 'parse':
    depth = int(args[3])
    file_name = args[4]
    db = Db(sqlite3.connect(name + '.db'), Sql())
    db.setup(depth)
    text = codecs.open(file_name, 'r', 'utf-8').read()  # renamed from `txt` to avoid shadowing the txt class below
    Parser(name, db, SENTENCE_SEPARATOR, WORD_SEPARATOR).parse(text)
elif mode == 'gen':
    count = int(args[3])
    db = Db(sqlite3.connect(name + '.db'), Sql())
    generator = Generator(name, db, Rnd())
    ##for i in range(0, count):
    ##    print(generator.generate(WORD_SEPARATOR))
    config = configparser.ConfigParser()
    config.read("document.config")
    file_size = config.get("document", "filesize").split(",")
    file_name = config.get("document", "filename").split(",")
    file_format = config.get("document", "format")
    if file_format == "txt":
        temptxt = txt(file_size, file_name, generator, WORD_SEPARATOR)
        temptxt.execute()
    elif file_format == "doc":
        tempdoc = doc(file_size, file_name, generator, WORD_SEPARATOR)  # renamed from `doc` to avoid rebinding the doc class
        tempdoc.execute()
    else:
        raise ValueError(usage)
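# A "document.config" consistent with the keys read above would look like
# this (the values are illustrative assumptions; filesize and filename are
# parsed as comma-separated lists):
#
#   [document]
#   filesize = 1000,5000
#   filename = sample1,sample2
#   format = txt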