def parseALump(lumpnum):
    lumplocation = LUMP_DIR + '/enwiki_lumped_' + str(lumpnum) + '.xml'
    subresultlocation = SUBRESULT_DIR + '/' + str(lumpnum) + '.json'
    page_parser.parseWithCallback(lumplocation, findOCLCNums)
    logging.info('%s , got to page_parser', str(lumpnum))
    logging.info('%s , had an oclcNumDict of len %d', str(lumpnum), len(oclcNumDict))
    oclcNumJSON = open(subresultlocation, 'w')
    logging.info('%s , got to open %s', str(lumpnum), subresultlocation)
    json.dump(oclcNumDict, oclcNumJSON, indent=4)
    logging.info('%s , got to jsondump %s', str(lumpnum), subresultlocation)
    oclcNumJSON.close()
    logging.info('%s , got to close the JSON dump', str(lumpnum))
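# parseALump() above leans on module-level names that are not part of this
# snippet: LUMP_DIR, SUBRESULT_DIR, oclcNumDict and the findOCLCNums callback.
# A hedged sketch of what they might look like; the directory names and the
# OCLC-number regex are assumptions, not the original code.
import json
import logging
import re

import page_parser

LUMP_DIR = 'lumps'            # assumed: directory holding the split dump files
SUBRESULT_DIR = 'subresults'  # assumed: directory for the per-lump JSON output
oclcNumDict = {}              # page title -> OCLC numbers found in that page

# assumed pattern: catches "OCLC 123456" and "oclc = 123456" style usages
oclcPattern = re.compile(r'\bOCLC\b\s*[=|]?\s*(\d+)', re.IGNORECASE)

def findOCLCNums(page):
    # Callback run once per <page>: collect any OCLC numbers in the wikitext.
    nums = oclcPattern.findall(page.text)
    if nums:
        oclcNumDict[page.title] = nums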
import sys
import page_parser as parser

numPages = 0

def countPage(page):
    global numPages
    numPages += 1
    if numPages % 1000 == 0:
        print "counted %d" % numPages

if __name__ == '__main__':
    parser.parseWithCallback(sys.argv[1], countPage)
    print "numPages = %d" % numPages
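# Every snippet in this file goes through page_parser.parseWithCallback.  The
# module itself is not shown; below is a minimal sketch of what it presumably
# does, inferred only from how it is called here (the callback receives an
# object with .id, .title and .text, once per <page> element).  The real
# implementation may differ.
import xml.sax

class _Page(object):
    # Stand-in for the parser's page object.
    def __init__(self):
        self.id = None
        self.title = None
        self.text = None

class _PageHandler(xml.sax.ContentHandler):
    def __init__(self, callback):
        xml.sax.ContentHandler.__init__(self)
        self.callback = callback
        self.page = None
        self.chars = []

    def startElement(self, name, attrs):
        if name == 'page':
            self.page = _Page()
        self.chars = []

    def characters(self, content):
        self.chars.append(content)

    def endElement(self, name):
        text = ''.join(self.chars)
        if self.page is None:
            return
        if name == 'title':
            self.page.title = text
        elif name == 'id' and self.page.id is None:
            self.page.id = text          # first <id> under <page> is the page id
        elif name == 'text':
            self.page.text = text
        elif name == 'page':
            self.callback(self.page)     # hand the finished page to the callback
            self.page = None

def parseWithCallback(filename, callback):
    xml.sax.parse(filename, _PageHandler(callback))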
    # tail of dumpWordFreqs(): write out every word seen more than minFreq
    # times, then close the output file
    for word, freq in word2freq.iteritems():
        if freq > minFreq:
            f.write('%s, %d\n' % (word, freq))
    f.close()

def printPage(page):
    global numPages
    numPages += 1
    if numPages % 100 == 0:
        print "parsed %d" % numPages
        dumpWordFreqs()  # TEMP
    if maxPages > 0 and numPages >= maxPages:
        raise Exception("done")
    text = page.text.encode('utf_8')
    words = ''.join((c.lower() if c.isalpha() else ' ') for c in text).split()
    for word in words:
        if word not in word2freq:
            word2freq[word] = 0
        word2freq[word] += 1

if __name__ == '__main__':
    if len(sys.argv) > 2:
        maxPages = int(sys.argv[2])
    if len(sys.argv) > 3:
        minFreq = int(sys.argv[3])
    parser.parseWithCallback(sys.argv[1], printPage)
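# The word-frequency snippet above begins mid-way through dumpWordFreqs(); the
# missing top of that module presumably looks something like this (the output
# file name and the defaults for the globals are assumptions inferred from the
# fragment, which the loop repeated here completes).
import sys
import page_parser as parser

numPages = 0
maxPages = 0          # 0 means no page limit; overridden by sys.argv[2]
minFreq = 0           # words must appear more often than this; sys.argv[3]
word2freq = {}        # word -> occurrence count

def dumpWordFreqs():
    f = open('wordfreqs.txt', 'w')   # assumed output path
    for word, freq in word2freq.iteritems():
        if freq > minFreq:
            f.write('%s, %d\n' % (word, freq))
    f.close()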
    # tail of contentHandler(content): report progress, then count how many of
    # the loaded pages this page's wikitext links to
    print content.id, time.strftime('%H:%M:%S', time.gmtime(time.time() - start))
    # Find all of the links; if a loaded page is among them, bump its reference count
    links = linksPattern.findall(content.text)
    for page in pages:
        if page['title'] in links:
            page['referenceCount'] += 1

if __name__ == "__main__":
    with codecs.open('out.txt', encoding='utf-8') as pagesFile:
        pages = json.load(pagesFile)
    print 'Done loading json'
    print 'loaded %d pages' % len(pages)
    for page in pages:
        page['referenceCount'] = 0
    try:
        start = time.time()
        page_parser.parseWithCallback(sys.argv[1], contentHandler)
    except KeyboardInterrupt:
        pass
    finally:
        with codecs.open('eventswithreferences.json', 'w', encoding='utf-8') as outFile:
            try:
                for page in pages:
                    print page['title'], page['referenceCount']
                outFile.write(json.dumps(pages))
                print "left off at: ", lastContentId
            except:
                pdb.set_trace()
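# The reference-counting script above is also a fragment: its imports, the
# linksPattern regex, lastContentId and the opening of contentHandler are
# missing.  A sketch of what the missing top of the file might look like; the
# wiki-link regex in particular is an assumption.
import codecs
import json
import pdb
import re
import sys
import time

import page_parser

# [[Target]] or [[Target|label]] -> capture just the target title
linksPattern = re.compile(r'\[\[([^\]|#]+)')
lastContentId = None

def contentHandler(content):
    # Called once per page; remember where we got to so an interrupted run can
    # report it, then continue with the counting code shown above.
    global lastContentId
    lastContentId = content.id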
# def category_links(text):
#     if re.match(r"INSERT INTO `categorylinks` VALUES", text):
#         return re.findall(r"\((\d+),'([^']+)'(?:,'[^']*'){5}\)", text)

def processPage(page):
    # Print every page that is not in the Hebrew 'Category' (קטגוריה) namespace.
    if page.title.find('קטגוריה') != 0:
        print "----- " + page.title + " ----"
        print page.text
    # Commented-out variant that matched 'Template' (תבנית) pages instead:
    # if page.title.find('תבנית') == 0:
    #     print "----- " + page.title + " ----"
    #     print page.text
    #     print category_links(page.text)
    #     print "-------------------------------------------------------------------------"

class DataGathering(object):
    def __init__(self, N=200, M=2):
        self.articals = []
        self.data_gathering()

    def data_gathering(self):
        page_parser.parseWithCallback(
            "hewikisource-20120628-pages-articles.xml", processPage)

    def _getArticals(self):
        return self.articals

page_parser.parseWithCallback("hewikisource-20120628-pages-articles.xml", processPage)
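# Note that DataGathering above never actually fills self.articals: processPage
# is a module-level callback that only prints.  A hypothetical variant that
# collects the non-category pages via a bound-method callback, offered as a
# sketch rather than the original author's code:
import page_parser

class DataGatheringCollecting(object):
    def __init__(self, dump="hewikisource-20120628-pages-articles.xml"):
        self.articals = []                      # keeps the original attribute name
        page_parser.parseWithCallback(dump, self._collect)

    def _collect(self, page):
        # Keep every page outside the Hebrew 'Category' (קטגוריה) namespace.
        if page.title.find('קטגוריה') != 0:
            self.articals.append((page.title, page.text))

    def _getArticals(self):
        return self.articals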
    pass

def processPageForCoords(page):
    page.coords = []
    for s in coordsPattern.findall(page.text):
        try:
            page.coords.append(Coords(s))
        except ValueError:
            dataSource.saveInvalidCoordPage(page, s)

def processPage(page):
    """
    We're interested in pages representing years with event descriptions,
    and those which mention any sort of geographic coordinates.
    """
    if isYearPattern.match(page.title):
        processYear(page)
        for event in page.events:
            dataSource.saveEvent(event)
    else:
        processPageForCoords(page)
        if page.coords:
            dataSource.savePage(page)

if __name__ == "__main__":
    dataSource = DataSource()
    page_parser.parseWithCallback(sys.argv[1], processPage)
    print
    print "Done parsing: ", sys.argv[1]
    outputs = (DataSource.EVENTS_SAVED, DataSource.COORD_PAGES, DataSource.INVALID_COORD_PAGES)
    print "Events: %d; Pages with coords: %d; Invalid coords: %d" % outputs
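# The year/coordinate script relies on two regexes that are not part of the
# snippet (isYearPattern, coordsPattern), plus Coords, DataSource and
# processYear.  Plausible shapes for the two patterns, offered purely as
# assumptions based on how they are used (titles like "1492" or "44 BC", and
# {{coord|...}} templates in the wikitext):
import re

isYearPattern = re.compile(r'^\d{1,4}(\s+BC)?$')            # assumed title form
coordsPattern = re.compile(r'\{\{[Cc]oord\s*\|[^}]*\}\}')   # assumed coord template form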