Example No. 1
import json
import logging

import page_parser

# LUMP_DIR, SUBRESULT_DIR, findOCLCNums, and oclcNumDict are defined
# elsewhere in the module this function was taken from.


def parseALump(lumpnum):
    lumplocation = LUMP_DIR + '/enwiki_lumped_' + str(lumpnum) + '.xml'
    subresultlocation = SUBRESULT_DIR + '/' + str(lumpnum) + '.json'
    page_parser.parseWithCallback(lumplocation, findOCLCNums)
    logging.info('%s , got to page_parser', str(lumpnum))
    logging.info('%s , had an oclcNumDict of len %d', str(lumpnum),
                 len(oclcNumDict))
    oclcNumJSON = open(subresultlocation, 'w')
    logging.info('%s , got to open %s', str(lumpnum), subresultlocation)
    json.dump(oclcNumDict, oclcNumJSON, indent=4)
    logging.info('%s , got to jsondump %s', str(lumpnum), subresultlocation)
    oclcNumJSON.close()
    logging.info('%s , got to close the JSON dump', str(lumpnum))
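
Every example on this page uses the same entry point: page_parser.parseWithCallback(path, callback) streams a MediaWiki XML dump and invokes the callback once per page. The snippets never show the page class itself, but they rely on .title, .text, and (in some examples) .id attributes. A minimal sketch assuming only that inferred interface; the dump filename is illustrative:

import page_parser

def show_title(page):
    # Called once for each <page> element in the dump.
    print page.title

page_parser.parseWithCallback('enwiki-latest-pages-articles.xml', show_title)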
Example No. 2
import sys

import page_parser as parser

numPages = 0


def countPage(page):
    """Callback invoked once per page; counts the pages in the dump."""
    global numPages

    numPages += 1
    if numPages % 1000 == 0:
        print "counted %d" % numPages


if __name__ == '__main__':
    # Usage: <script> <dump.xml>
    parser.parseWithCallback(sys.argv[1], countPage)
    print "numPages = %d" % numPages

Example No. 3
import codecs
import json
import pdb
import sys
import time

import page_parser

# linksPattern (a compiled regular expression) and lastContentId are defined
# earlier in the original script; this snippet is a fragment.


def contentHandler(content):
    # Find all of the links. If a page's title is among them, count the
    # reference.
    links = linksPattern.findall(content.text)
    for page in pages:
        if page['title'] in links:
            page['referenceCount'] += 1


if __name__ == "__main__":
    with codecs.open('out.txt', encoding='utf-8') as pagesFile:
        pages = json.load(pagesFile)
        print 'Done loading json'
        print 'loaded %d pages' % len(pages)

    for page in pages:
        page['referenceCount'] = 0

    try:
        start = time.time()
        page_parser.parseWithCallback(sys.argv[1], contentHandler)
    except KeyboardInterrupt:
        pass
    finally:
        with codecs.open('eventswithreferences.json', 'w',
                         encoding='utf-8') as outFile:
            try:
                for page in pages:
                    print page['title'], page['referenceCount']
                outFile.write(json.dumps(pages))
                print "left off at: ", lastContentId
            except:
                # Drop into the debugger rather than lose the results.
                pdb.set_trace()
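
Neither linksPattern nor lastContentId is defined in the fragment. A plausible stand-in for linksPattern, hypothetical and purely for illustration, captures the target of [[Target]] or [[Target|label]] wiki markup:

import re

# Hypothetical definition; the original script's pattern is not shown.
linksPattern = re.compile(r'\[\[([^\]|#]+)')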
Example No. 4
import sys

import page_parser as parser

# Module-level state from the original script. The zero defaults for
# maxPages and minFreq are assumptions; the fragment only shows them being
# overridden from the command line.
word2freq = {}
numPages = 0
maxPages = 0
minFreq = 0


def dumpWordFreqs():
    # The original fragment opens f earlier in the function; the output
    # path is not shown, so this filename is hypothetical.
    f = open('word_freqs.txt', 'w')
    for word, freq in word2freq.iteritems():
        if freq > minFreq:
            f.write('%s, %d\n' % (word, freq))
    f.close()


def printPage(page):
    global numPages

    numPages += 1
    if numPages % 100 == 0:
        print "parsed %d" % numPages
        dumpWordFreqs()
    #TEMP
    if maxPages > 0 and numPages >= maxPages:
        raise Exception("done")
    text = page.text.encode('utf_8')
    words = ''.join((c.lower() if c.isalpha() else ' ') for c in text).split()

    for word in words:
        if word not in word2freq:
            word2freq[word] = 0
        word2freq[word] += 1


if __name__ == '__main__':
    # Usage: <script> <dump.xml> [maxPages [minFreq]]
    if len(sys.argv) > 2:
        maxPages = int(sys.argv[2])
    if len(sys.argv) > 3:
        minFreq = int(sys.argv[3])
    parser.parseWithCallback(sys.argv[1], printPage)
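
Two design choices stand out here. dumpWordFreqs runs every 100 pages, so partial counts survive an interrupted run, and raising Exception("done") appears to be the author's way of bailing out once maxPages is reached, since the callback has no other visible way to stop parseWithCallback mid-dump.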

Example No. 5

import page_parser


def processPage(page):
    # Skip category pages: titles beginning with 'קטגוריה' (Hebrew for
    # "Category") mark category pages in this dump.
    if page.title.find('קטגוריה') != 0:
        print "----- " + page.title + " ----"
        print page.text


class DataGathering(object):
    def __init__(self, N=200, M=2):
        self.articals = []
        self.data_gathering()

    def data_gathering(self):
        page_parser.parseWithCallback(
            "hewikisource-20120628-pages-articles.xml", processPage)

    def _getArticals(self):
        return self.articals


if __name__ == '__main__':
    # Guarded so that importing this module does not start a full parse.
    page_parser.parseWithCallback("hewikisource-20120628-pages-articles.xml",
                                  processPage)
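
Note that constructing DataGathering is itself enough to parse the whole dump, since __init__ calls data_gathering immediately; the N and M parameters and the articals list are never used by anything shown here, so they look like scaffolding for a later stage.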

Example No. 6
import sys

import page_parser

# Coords, DataSource, coordsPattern, isYearPattern, and processYear are
# defined elsewhere in the original script.


def processPageForCoords(page):
    page.coords = []
    for s in coordsPattern.findall(page.text):
        try:
            page.coords.append(Coords(s))
        except ValueError:
            dataSource.saveInvalidCoordPage(page, s)


def processPage(page):
    """
    We're interested in pages representing years with event descriptions,
    and those which mention any sort of geographic coordinates.
    """
    if isYearPattern.match(page.title):
        processYear(page)
        for event in page.events:
            dataSource.saveEvent(event)
    else:
        processPageForCoords(page)
        if page.coords:
            dataSource.savePage(page)


if __name__ == "__main__":
    dataSource = DataSource()
    page_parser.parseWithCallback(sys.argv[1], processPage)
    print
    print "Done parsing: ", sys.argv[1]
    outputs = (DataSource.EVENTS_SAVED, DataSource.COORD_PAGES,
               DataSource.INVALID_COORD_PAGES)
    print "Events: %d; Pages with coords: %d; Invalid coords: %d" % outputs