Example #1
def getSelected(self):
    # Get the selected Wikipedia articles listed in selected.txt.
    # Assumes xmlwikiprep and lxml's html module are imported at module level.
    inHandle1 = open(r'/home/lhy/ESA/wikiprep-esa/selected.txt')
    lines = inHandle1.readlines()
    inHandle1.close()
    inHandle2 = open(r'/home/lhy/ESA/text/20051105_pages_articles.hgw.xml')
    selectedID = set()
    selectedArticles = []
    self.linkNum = []
    self.idList = []
    self.inLinkDict = {}
    for line in lines:
        selectedID.add(int(line.strip('\n')))
    for _id in selectedID:
        self.inLinkDict[_id] = 0
    for doc in xmlwikiprep.read(inHandle2):
        page_id = int(doc["_id"])
        # Count incoming links to any selected article, from every page.
        for link in doc["links"]:
            linkID = int(link)
            if linkID in selectedID:
                self.inLinkDict[linkID] += 1
        if page_id not in selectedID:
            continue
        self.idList.append(page_id)
        # Strip HTML markup and lower-case the title and body text.
        title = html.fromstring(doc["title"]).text_content().lower()
        text = html.fromstring(doc["text"]).text_content().lower()
        selectedArticles.append(title + ' ' + text)
        #self.linkNum.append(len(doc["links"]))
    inHandle2.close()
    self.text = selectedArticles
    words = self.getWordSim()
    #counter = wordCounter.WordCounter(words, self.text)
    #counter.tfidf(20)
    print "SelectedArticles ok"
Example #2
                        """, linkBuffer)

            linkBuffer = []
            linkBuflen = 0

    return


args = sys.argv[1:]
# scanData.py <hgw_file>

if len(args) < 1:
    print >>sys.stderr, 'Usage: scanData.py <hgw_file>'
    sys.exit(1)

f = open(args[0], 'r')
for doc in xmlwikiprep.read(f):
    recordArticle(doc)
f.close()

if nsBuflen > 0:
    cursor.executemany(
        """
        INSERT INTO namespace (id)
        VALUES (%s)
        """, nsBuffer)

    nsBuffer = []
    nsBuflen = 0

if linkBuflen > 0:
    cursor.executemany(
        """
        INSERT INTO pagelinks (source_id,target_id)
        VALUES (%s,%s)
        """, linkBuffer)

    linkBuffer = []
    linkBuflen = 0
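
The example above accumulates rows in a plain Python list and flushes them with executemany once a threshold is reached, then once more at the end for the remainder; the same pattern recurs in the examples below. A minimal sketch of that buffered bulk-insert pattern using sqlite3 from the standard library (the original uses a MySQL-style cursor with %s placeholders; BUFSIZE is a hypothetical threshold):

import sqlite3

BUFSIZE = 10000  # hypothetical flush threshold
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
cursor.execute('CREATE TABLE pagelinks (source_id INTEGER, target_id INTEGER)')

linkBuffer = []

def add_link(source_id, target_id):
    linkBuffer.append((source_id, target_id))
    if len(linkBuffer) >= BUFSIZE:
        flush_links()

def flush_links():
    # sqlite3 uses ? placeholders where MySQLdb uses %s.
    cursor.executemany(
        'INSERT INTO pagelinks (source_id, target_id) VALUES (?, ?)',
        linkBuffer)
    del linkBuffer[:]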
Example #3
                cursor.executemany("""
                    INSERT INTO pagelinks (source_id,target_id)
                    VALUES (%s,%s)
                    """, linkBuffer)
                linkBuffer = []

    return

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print "scanLinks.py file1.gz file2.gz ... > links.txt"
        sys.exit(1)

    for fname in sys.argv[1:]:
        print >>sys.stderr, "  -> Processing file", fname
        #f = Popen(['zcat', fname], stdout=PIPE) # much faster than python gzip
        f = Popen(['pigz', '-d', '-c', fname], stdout=PIPE) # even faster

        for doc in xmlwikiprep.read(f.stdout, set(['text'])):
            recordArticle(doc)
        f.wait()  # let the external decompressor exit cleanly

    if nsBuffer:
        cursor.executemany("""
        INSERT INTO namespace (id)
            VALUES (%s)
            """, nsBuffer)

    if linkBuffer:
        cursor.executemany("""
        INSERT INTO pagelinks (source_id,target_id)
            VALUES (%s,%s)
            """, linkBuffer)

    print >>sys.stderr, "Some db mangling.."
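
The Popen/pigz line above streams a gzip file through an external, parallel decompressor instead of Python's gzip module. A sketch of a small helper with a fallback for machines without pigz (the helper name is illustrative; the fallback assumes the input is an ordinary gzip stream either way):

import gzip
from subprocess import Popen, PIPE

def open_gz(fname):
    # Prefer pigz for speed; fall back to the gzip module if it is missing.
    try:
        return Popen(['pigz', '-d', '-c', fname], stdout=PIPE).stdout
    except OSError:
        return gzip.open(fname, 'rb')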
Example #4
    cs = []
    for c in cats:
        # Record curId under each of its categories.
        if c in catDict:
            catDict[c].add(curId)
        else:
            catDict[c] = set([curId])

    return


# scanCatHier.py <hgw/gum.xml> --stopcats=<category list file>

f = open(args[0], 'r')

for doc in xmlwikiprep.read(f):
    recordArticle(doc)

f.close()

print 'cat_hier output complete'
print 'traversing category tree..'

cats = set(STOP_CATS)
outcats = set(STOP_CATS)

while cats:
    parent = cats.pop()

    childs = []
    if parent in catDict:
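
The fragment breaks off just as the traversal begins: starting from the stop categories, pop a category, look up its children in catDict, and keep expanding until nothing new is reachable. A sketch of that walk (the function name is illustrative; catDict maps a category id to the set of ids recorded under it, as built above):

def collect_descendants(stop_cats, catDict):
    cats = set(stop_cats)      # work queue of categories still to expand
    outcats = set(stop_cats)   # everything reached so far
    while cats:
        parent = cats.pop()
        for child in catDict.get(parent, ()):
            if child not in outcats:
                outcats.add(child)
                cats.add(child)
    return outcats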
Example #5
        cursor.executemany("""
            INSERT INTO article (id,title)
            VALUES (%s,%s)
            """, articleBuffer)
        cursor.executemany("""
            INSERT INTO text (old_id,old_text)
            VALUES (%s,%s)
            """, textBuffer)
        articleBuffer = []
        textBuffer = []

for fname in args:
    print >>sys.stderr, "  -> Processing file", fname
    #f = Popen(['zcat', fname], stdout=PIPE) # much faster than python gzip
    f = Popen(['pigz', '-d', '-c', fname], stdout=PIPE) # even faster

    for doc in xmlwikiprep.read(f.stdout):
        recordArticle(doc)

# f = open(hgwpath, 'r')
# for doc in xmlwikiprep.read(f):
#     recordArticle(doc)
# f.close()

if articleBuffer:
    cursor.executemany("""
        INSERT INTO article (id,title)
        VALUES (%s,%s)
        """, articleBuffer)
    cursor.executemany("""
        INSERT INTO text (old_id,old_text)
        VALUES (%s,%s)
        """, textBuffer)
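
After the last file is processed, any rows still sitting in the two buffers are flushed in one final pass, as in the block above. A sketch of that final step as a helper (the name is illustrative; it assumes a DB-API cursor whose paramstyle is format, as with MySQLdb):

def flush_remaining(cursor, articleBuffer, textBuffer):
    # Drain whatever the per-file loop left behind.
    if articleBuffer:
        cursor.executemany(
            'INSERT INTO article (id, title) VALUES (%s, %s)',
            articleBuffer)
        del articleBuffer[:]
    if textBuffer:
        cursor.executemany(
            'INSERT INTO text (old_id, old_text) VALUES (%s, %s)',
            textBuffer)
        del textBuffer[:]
    # The original script presumably commits and closes the connection after this.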