Example #1
    def loadData(self):
        from wikitools import wiki
        from wikitools import category

        wikiobj = wiki.Wiki("https://en.wikipedia.org/w/api.php")
        wikicat = category.Category(wikiobj, title="2016_films")
        self.wikipages = wikicat.getAllMembers()
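For context, getAllMembers() returns a list of wikitools Page objects. A minimal standalone sketch of the same fetch with a loop over the result (the printing loop is illustrative, not part of the original):

from wikitools import wiki, category

# Fetch all members of a category and print their titles (sketch).
wikiobj = wiki.Wiki("https://en.wikipedia.org/w/api.php")
wikicat = category.Category(wikiobj, title="2016_films")
for member in wikicat.getAllMembers():
    print member.title.encode('utf-8')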
Example #2
    def promote(self,
                title=False,
                removePrefix=False,
                force=False,
                reason=False,
                watch=False,
                unwatch=False):
        '''Promote a page to a category.

        title - title of the new category (defaults to the page title)
        removePrefix - when using the page title as the category title, remove the namespace prefix
        force - if the category already exists, overwrite it anyway
        reason, watch, unwatch - passed through to the final delete() call
        '''
        if not self.exists:
            raise page.NoPage
        if self.title.startswith("Category:"):
            raise NotPromotable
        if not title:
            title = self.title
            if removePrefix:
                title = re.sub(".*:", "", title)
            title = "Category:" + title
        cat = category.Category(self.site, title)
        if cat.exists and not force:
            raise AlreadyExists
        cat.edit(text=self.getWikiText())
        self.rewriteReferences(self.getBacklinks(), title)
        self.delete(reason=reason, watch=watch, unwatch=unwatch)
        return cat
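A standalone sketch of the core promotion steps using wikitools primitives; the site URL and page title are placeholders, the backlink rewriting is omitted, and running this requires edit and delete rights:

from wikitools import wiki, page, category

site = wiki.Wiki("https://example.org/w/api.php")
src = page.Page(site, "Some page")
# Copy the page text into the new category page, then delete the original.
cat = category.Category(site, "Category:Some page")
cat.edit(text=src.getWikiText())
src.delete(reason="Promoted to category")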
Example #3
    def get_category_recursively(self, category_title, max_articles_num=None):
        """
        Iterative BFS on the category tree
        returns all articles found in the run, as wiki Page objects

        :param category_title: title of needed category
        :param max_articles_num: maximum number of articles to fetch. stops after reaching the limit
                                 'None' means without limit.
        :return:
        """

        closed_categories = set()
        open_categories = [category_title]
        articles = set()

        while open_categories:

            current_category_name = open_categories.pop()
            if current_category_name in closed_categories:
                continue
            current_category = category.Category(self.site, current_category_name)

            for d in self.get_category_articles(current_category):
                if self.is_category(d.title):
                    open_categories.append(d.title)
                else:
                    articles.add(self.attach_metadata(d))
                    # stop early once max_articles_num is reached
                    if max_articles_num is not None and len(articles) >= max_articles_num:
                        return articles
            closed_categories.add(current_category_name)

        return articles
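Since the method relies on helpers of its host class (get_category_articles, is_category, attach_metadata), here is a hedged standalone sketch of the same stack-based traversal built directly on wikitools; the site URL and starting category are assumptions:

from wikitools import wiki, category

site = wiki.Wiki("https://en.wikipedia.org/w/api.php")
stack = ["Category:2016_films"]
seen = set()
pages = []
while stack:
    name = stack.pop()
    if name in seen:
        continue
    seen.add(name)
    # Subcategories go back on the stack; everything else is an article.
    for member in category.Category(site, name).getAllMembers():
        if member.title.startswith("Category:"):
            stack.append(member.title)
        else:
            pages.append(member)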
Example #4
    def get_category(self, catname):
        cat = category.Category(self.site, catname)
        items = []
        for article in cat.getAllMembersGen(namespaces=[0]):
            items.append(article.title.lower())
            if len(items) % 1000 == 0:
                print 'Downloading item %5d : %20s' % (len(items), items[-1])
        return items
Example #5
def get_count(district):

    cat = category.Category(site, district.strip("\n") + " மாவட்ட ஆசிரியர்கள் தொடங்கிய கட்டுரைகள்")

    counter = 0
    for article in cat.getAllMembersGen():
        counter += 1

    return counter
Example #6
def get_snpedia_snp_names():

    site = wiki.Wiki('http://bots.snpedia.com/api.php')
    snps = category.Category(site, 'Is_a_snp')
    snpedia = set()

    for article in snps.getAllMembersGen(namespaces=[0]):
        snpedia.add(article.title.lower())

    return snpedia
Example #7
    def test_getAllMembers(self):
        c = category.Category(self.site, "Test pages")
        api.logging = True
        members = c.getAllMembers()
        self.assertIsInstance(members[0], page.Page)
        log = api.querylog.pop()
        self.assertNotIn("cmnamespace", log)
        members = c.getAllMembers(namespaces=[3, 5])
        self.assertEqual(len(members), 0)
        log = api.querylog.pop()
        self.assertIn("cmnamespace", log)
Example #8
def get_drugs(fname):
    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    drugs = category.Category(site, "Is_a_medicine")
    n = 0

    with open(fname, 'w') as f:
        for article in drugs.getAllMembersGen(namespaces=[0]):
            drug = _normalize_str(article.title.strip())
            f.write(drug + '\n')
            n += 1

    print 'drugs extracted:', n
Example #9
def snpedia_getter():
    site = wiki.Wiki("http://snpedia.com/api.php")  # open snpedia
    snps = category.Category(site, "Is_a_snp")
    snpedia = {}

    for article in snps.getAllMembersGen(namespaces=[0]):  # get all snp-names
        snpedia[article.title.lower()] = "in snpedia"
        print article.title

    with open("snpedia.data", "wb") as snpedia_outfile:  # save all snps to cache
        pickle.dump(snpedia, snpedia_outfile)
    return snpedia
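Reading the cache back is symmetric; a small sketch using the file name from the snippet above:

import pickle

# Load the SNP names cached by snpedia_getter().
with open("snpedia.data", "rb") as snpedia_infile:
    snpedia = pickle.load(snpedia_infile)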
Example #10
    def get_articles(self, lang, update=False):

        if update:
            for r in self.con.execute('select distinct category from ' + lang + '_wikipages where category not in (select ' + lang + ' from categories)'):
                self.con.execute('delete from ' + lang + '_wikipages where category=?', (str(r[0]),))
                print('Deleting category', r[0], '...')
        else:
            self.con.execute('delete from ' + lang + '_wikipages')
        self.con.commit()
        wikipedia.set_lang(lang)
        wikisite = 'http://' + lang + '.wikipedia.org/w/api.php'

        wikiObject = w.Wiki(wikisite)
        cats = self.con.execute('select ' + lang + ' from categories').fetchall()
        for cat in cats:
            print('Checking category:', cat[0])
            if lang == 'ru':
                wikiCategory = c.Category(wikiObject, title='Категория:' + cat[0])
            elif lang == 'en':
                wikiCategory = c.Category(wikiObject, title='Category:' + cat[0])
            else:
                break
            articles = wikiCategory.getAllMembers(namespaces=[0])
            if len(articles) > 200:
                articles = articles[0:200]
            for article in articles:
                try:
                    if self.is_indexed(article.title, lang):
                        continue
                    print('Loading article', article.title, '...')
                    new_article = wikipedia.page(article.title)
                    if len(new_article.content) == 0:
                        continue
                    self.con.execute('insert into ' + lang + '_wikipages(name, content, category) values(?, ?, ?)', (article.title, new_article.content, cat[0]))
                except Exception:
                    continue
            self.con.commit()
Example #11
    def run(self):
        cat = category.Category(self.wiki, self.categoryname)
        self.overviewpage = page.Page(self.wiki, u"VEIDs")

        self.veidlist = {}
        for article in cat.getAllMembersGen(namespaces=[0]):
            self.collect_page_detail(article)

        try:
            oldtext = self.overviewpage.getWikiText()
        except page.NoPage:
            oldtext = ""

        newtext = self.build_new_overviewpage_text()
        # only save if something was changed
        if newtext == oldtext:
            return

        self.overviewpage.edit(text=newtext,
                               skipmd5=True,
                               bot=True,
                               summary=u"Regenerated list.")
Example #12
logging.info("Checking for bot access rights")
bot_flag = check_for_bot(wiki_username)

if bot_flag:
    logging.info("The user " + wiki_username + " has bot access.")
else:
    logging.info("The user " + wiki_username + " does not have bot access")

commons_url = "https://commons.wikimedia.org/w/api.php"

commons = wikitools.wiki.Wiki(commons_url)

counter = 1

cat = category.Category(commons, "PDF files in Tamil with OCR conversion")
# iterate through all the files in namespace 6 (File:)
for pdf in cat.getAllMembersGen(namespaces=[6]):
    print str(counter) + ".    " + pdf.title.encode('utf-8')
    counter += 1
    pdf_name = pdf.title.encode('utf-8').split("File:")[1]
    #pdf_name = "சிந்தனைப் பந்தாட்டம்.pdf"
    index_page = wikitools.Page(commons, "Index:" + pdf_name, followRedir=True)
    edit_summary = "Index creation"

    content = " "

    #if index_page.exists:
    #    print index_page.getWikiText()
    #    logger.info("page already there")
    #else:
Example #13
    def wikiupdate(self, title, url):
        cat = category.Category(self.site, "Linklist")
        for article in cat.getAllMembersGen(namespaces=[0]):
            print article.edit(appendtext="\n* {title} - {url} \n".format(
                title=title, url=url))
Example #14
    def get_snps(self):
        """Generator which yields SNP names from SNPedia."""
        snps = category.Category(self.site, "Is_a_snp")
        for article in snps.getAllMembersGen(
                namespaces=[0]):  # main-namespace pages only
            yield article.title.lower()
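A module-level version of the same generator for use outside the host class; the site URL is borrowed from the other SNPedia examples on this page:

from wikitools import wiki, category

def iter_snps():
    # Yield every SNP page title in the Is_a_snp category, lowercased.
    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    snps = category.Category(site, "Is_a_snp")
    for article in snps.getAllMembersGen(namespaces=[0]):
        yield article.title.lower()

for rsid in iter_snps():
    print rsid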
Example #15
                    else:
                        s[compress(k)] = v
                        field_lookup[compress(k)] = 1
                print json.dumps(s)
                scores.append(s)
    return (scores, files)


db.execute(
    '''create table if not exists score (id text, category text, scoretype text)'''
)
db.execute(
    '''create table if not exists scorefile (file_n text, score text)''')

catname = "Category:Mozart,_Wolfgang_Amadeus"
cat = category.Category(site, catname)
(scores, files) = getCat(cat)

# Make sure the database has the right columns available..
add_dbfields(u"score", field_lookup.keys())
add_dbfields(u"scorefile", file_field_lookup.keys())

c = db.cursor()
for score in scores:
    c.execute('''select count(*) from score where id = ?''', (score['id'], ))
    (n, ) = c.fetchone()
    if n == 0:
        columns = map(to_unicode, score.keys())
        values = map((lambda column: to_unicode(score[column])), columns)
        query = "insert into score (" + ", ".join(columns) + ") values (" + (
            ", ".join(["?"] * len(columns))) + ")"
Example #16
"""
Carmi Rothberg
Assignment 2: Building a Corpus
"""

### IMPORTS ###
import re, json
import parsing, extract
from wikitools import wiki, category

### GET WIKI PAGES ###
print('importing pages...')
wikiobj = wiki.Wiki("https://en.wikipedia.org/w/api.php")
wikicat = category.Category(wikiobj, title = "2017_films")
wikipages = wikicat.getAllMembers()
print('pages imported...')

### EXTRACT INFORMATION FROM A PAGE ###
def page2dict(page):
    ### SETUP ###
    d = {'title': None, 'director': None, 'producer': None, 'starring': None,
         'runtime': None, 'country': None, 'language': None, 'time': None,
         'location': None, 'text': None}
    ### PRELIMINARY DATA ###
    title = re.sub(r'\s*\((2017)?\s*film\)\s*', '', str(page.title.encode('utf8'))) #guess film title from page title
    #print('\t'+title)
    d['title'] = title
    categories = [c[9:] for c in page.getCategories()]  # strip the "Category:" prefix (9 chars)
    d['categories'] = categories
    ### DATA FROM PAGE TEXT ###
    pagetext = parsing.remove_html(page.getWikiText()) #clean page text
    # get as much as possible from infobox #
    extract.infobox(pagetext, d)
Example #17
#coding=utf-8
from wikitools import wiki, category
import sys

# categories of interest:
#   Is_a_medical_condition
#   Is_a_gene
#   Is_a_genoset
#   Is_a_medicine
#   Topic
c = sys.argv[1]  # category name, passed as the first command-line argument
site = wiki.Wiki("http://bots.snpedia.com/api.php")
snps = category.Category(site, c)
for article in snps.getAllMembersGen(namespaces=[0]):
    print article.title.encode('u8')
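The script expects the category name as its only argument; a hypothetical invocation (the filename is a placeholder):

# Example invocation (script name assumed):
#   python list_category.py Is_a_gene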
Example #18
            zipfile.ZipFile("snpedia-archive.zip", mode="a",
                            compression=zipfile.ZIP_DEFLATED) as ziparchive:
        site = wiki.Wiki("http://bots.snpedia.com/api.php")
        snpsfile = csv.DictReader(
            genomefile,
            delimiter="\t",
            fieldnames=["rsid", "chromosome", "position", "genotype"])

        if "snpedia_rsids" in ziparchive.namelist():
            snpedia_rsids = {
                line.rstrip()
                for line in ziparchive.read("snpedia_rsids")
            }
        else:
            puts("Get list of SNPs on SNPedia ... ")
            snps = category.Category(site, "Is_a_snp")
            snpedia_rsids = {
                article.title.lower()
                for article in snps.getAllMembersGen(namespaces=[0])
            }
            ziparchive.writestr("snpedia_rsids",
                                "\n".join(sorted(snpedia_rsids)))
            puts("done\n")

        try:
            with open("snpedia-archive.json", "r") as snpinfofile:
                snpinfo = json.load(snpinfofile)
        except (IOError, ValueError):
            snpinfo = {}

        namelist = set(ziparchive.namelist())
Example #19
    plt.barh(y_pos, z, align='center', alpha=0.4)
    plt.yticks(y_pos, hot_tokens)
    plt.xlabel('Average number of occurrences per article')
    plt.title('Token distribution')
    plt.show()


site = wiki.Wiki("https://en.wikipedia.org/w/api.php")
# Select a category with a reasonable number of articles (>100)
#cat = "Culture"
cat = "Games"
print cat

print "Loading category data. This may take a while..."
cat_data = category.Category(site, cat)

corpus_titles = []
corpus_text = []

for n, page in enumerate(cat_data.getAllMembersGen()):
    print "\r Loading article {0}".format(n + 1),
    corpus_titles.append(page.title)
    corpus_text.append(page.getWikiText())

n_art = len(corpus_titles)
print "\nLoaded " + str(n_art) + " articles from category " + cat

corpus_tokens = []
corpus_filtered = []
for n, art in enumerate(corpus_text):
Example #20
import nltk

from wikitools import wiki
from wikitools import page
from wikitools import category

# List of movie categories to be extracted
categories = [
    "American horror films", "American Western (genre) films",
    "American children's films"
]

site = wiki.Wiki("http://en.wikipedia.org/w/api.php")
porter = nltk.stem.porter.PorterStemmer()  #to be used in stemming below

for c in categories:
    i = 0
    cat = category.Category(site, title=c)
    pageList = cat.getAllMembersGen()  #Page generator
    for page in pageList:  # note: this loop variable shadows the imported page module
        print i
        i += 1
        text = page.getWikiText()
        # Find the Plot section in the page
        plot = ""
        beg_i = text.find('==Plot')
        if beg_i == -1:
            continue
        plot = text[beg_i + 2:]
        end_i = plot.find('\n==')
        if end_i == -1:
            continue
        plot = plot[:end_i]
        # lowercasing, removing stopwords and stemming
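The excerpt ends at the comment above. A hedged sketch of that final step, assuming NLTK's punkt tokenizer and English stopword list have been downloaded (illustrative continuation, not from the original source):

        tokens = nltk.word_tokenize(plot.lower())
        stop = set(nltk.corpus.stopwords.words('english'))
        stems = [porter.stem(t) for t in tokens if t.isalpha() and t not in stop]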