Code Example #1
File: api.py  Project: aprajita16/readme-analysis
 def __init__(self, *args, **kwargs):
     session = cache.get_session(timeout=1.5)
     session.params['api_key'] = libraries_io_api_key
     super(self.__class__, self).__init__('https://libraries.io/api',
                                          session=session,
                                          *args,
                                          **kwargs)
Code Example #2
File: api.py  Project: aprajita16/readme-analysis
 def __init__(self, *args, **kwargs):
     super(self.__class__,
           self).__init__('https://api.github.com',
                          auth=(gh_username, gh_password),
                          session=cache.get_session(timeout=1.0),
                          *args,
                          **kwargs)
Code Example #3
File: api.py  Project: andrewhead/StackSkim
 def __init__(self, *args, **kwargs):
     super(self.__class__, self).__init__(
         'https://api.github.com',
         auth=(gh_username, gh_password),
         session=cache.get_session(timeout=1.0),
         *args, **kwargs
     )
Code Example #4
File: api.py  Project: andrewhead/StackSkim
 def __init__(self, *args, **kwargs):
     session = cache.get_session(timeout=1.5)
     session.params['api_key'] = libraries_io_api_key
     super(self.__class__, self).__init__(
         'https://libraries.io/api',
         session=session,
         *args, **kwargs
     )
Code Example #5
def get_records(query):
    """
    Fetch records from a Craigslist search for the given query.
    """
    # Get the latest page in our neighborhood
    resp = cache.get_session().get("http://sfbay.craigslist.org/search/eby/sss", params={
        'query': urllib.quote_plus(query),
    })
    soup = Soup(resp.text, "html.parser")

    # Go through each of the search results
    records = []
    for row in soup.select('li.result-row'):

        # Get the date of the post
        timestamp = row.find('time')['datetime']
        date_time = datetime.fromtimestamp(mktime(
            strptime(timestamp, "%Y-%m-%d %H:%M")
        ))

        # ... link ....
        link = row.find('a', attrs={'class': 'hdrlnk'})
        href = link['href']
        desc = link.text

        # ... item price ...
        prices = row.select('.result-price')
        price = "?"
        if prices:
            price = prices[0].text.replace('$', '')

        # ... where the item's at ...
        neighborhoods = row.select('.result-hood')
        neighborhood = "?"
        if neighborhoods:
            neighborhood = re.search(r'\((.*)\)', neighborhoods[0].text).group(1)

        # Only save the record if it's from the last two days
        if abs((date_time - datetime.now()).days) <= 2:
            records.append([
                '%d/%d' % (date_time.month, date_time.day), desc[:50] + "...",
                price, neighborhood, href,
            ])

    return records
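
The function above returns one list of strings per listing (posting date, truncated description, price, neighborhood, and link). A minimal usage sketch, with a made-up search term:

# Hypothetical usage sketch for get_records(); the query string is a placeholder.
for record in get_records("road bike"):
    print('\t'.join(record))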
Code Example #6
File: api.py  Project: aprajita16/readme-analysis
#! /usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
import logging
import ConfigParser
from requests.auth import AuthBase
import slumber
import os.path
import cache

logging.basicConfig(level=logging.INFO, format="%(message)s")
default_requests_session = cache.get_session(timeout=1)
default_requests_session.headers['User-Agent'] =\
    "Andrew Head (for academic analysis) <[email protected], Austin Le (for academic" +\
    " analysis) <*****@*****.**>"

lib_config = ConfigParser.ConfigParser()
lib_config.read(os.path.expanduser(os.path.join('~', '.libraries_config')))
libraries_io_api_key = lib_config.get('api', 'API_KEY')


class LibrariesIoAuth(AuthBase):
    def __init__(self, api_key):
        self.api_key = api_key

    def __call__(self, request):
        request.data['api_key'] = self.api_key
        return request
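
A note on the class above: requests passes an AuthBase callable the PreparedRequest, which has no data attribute, so the assignment in __call__ may not take effect as intended. Below is a minimal sketch of a variant that attaches the key as a URL query parameter instead, reusing the AuthBase import above; this is an assumption about the intended behavior, not the project's actual code.

class LibrariesIoParamAuth(AuthBase):
    # Hypothetical variant: append api_key to the query string of every request.
    def __init__(self, api_key):
        self.api_key = api_key

    def __call__(self, request):
        # PreparedRequest.prepare_url re-encodes the URL with the extra params.
        request.prepare_url(request.url, {'api_key': self.api_key})
        return request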

Code Example #7
File: so_tags.py  Project: andrewhead/StackSkim
#! /usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
import logging
import argparse
import cache
import peewee
from models import Bigram, Trigram, create_tables


logging.basicConfig(level=logging.INFO, format="%(message)s")
session = cache.get_session()
TARGET_TAGS = ['wget', 'regex', 'css-selectors', 'jquery']


def make_url(tags):
    tag_string = ';'.join(tags)
    return 'https://api.stackexchange.com/2.2/tags/{tags}/related'.format(tags=tag_string)


def fetch_bigrams():
    for tag in TARGET_TAGS:
        resp = session.get(make_url([tag]), params={
            'pagesize': 100,
            'site': 'stackoverflow',
        })
        respJson = resp.json()
        for i in respJson['items']:
            bg, _ = Bigram.get_or_create(tag1=tag, tag2=i['name'])
            bg.count = i['count']
Code Example #8
File: so_extract.py  Project: pombreda/StackSkim
            for tag in soup.children:
                if tag.name == 'pre':
                    rfile.write(tag.text + "\n")
                    rfile.write("--------------\n")


def question_lines_to_file(results):

    with codecs.open(QUESTION_LINES, 'w', encoding='utf8') as lfile:
        for r in results:
            soup = Soup(r['body'])
            for tag in soup.children:
                if tag.name == 'pre':
                    text = re.sub(r'\\\s*\n', '', tag.text)  # remove line continuations
                    lines = text.split('\n')
                    for line in lines:
                        if re.match("^(.*\$)?\s*wget", line):
                            line = re.sub("^.*\$\s*", "", line)
                            lfile.write(line.strip() + '\n')


if __name__ == '__main__':
    session = cache.get_session(timeout=1.0)
    questions = fetch_questions(session, 'wget')
    answers = fetch_answers(session, questions)
    answer_results_to_file(answers)
    answer_lines_to_file(answers)
    question_results_to_file(questions)
    question_lines_to_file(questions)
Code Example #9
File: so_extract.py  Project: andrewhead/StackSkim
            for tag in soup.children:
                if tag.name == 'pre':
                    rfile.write(tag.text + "\n")
                    rfile.write("--------------\n")


def question_lines_to_file(results):

    with codecs.open(QUESTION_LINES, 'w', encoding='utf8') as lfile:
        for r in results:
            soup = Soup(r['body'])
            for tag in soup.children:
                if tag.name == 'pre':
                    text = re.sub(r'\\\s*\n', '', tag.text)  # remove line continuations
                    lines = text.split('\n')
                    for line in lines:
                        if re.match("^(.*\$)?\s*wget", line):
                            line = re.sub("^.*\$\s*", "", line)
                            lfile.write(line.strip() + '\n')


if __name__ == '__main__':
    session = cache.get_session(timeout=1.0)
    questions = fetch_questions(session, 'wget')
    answers = fetch_answers(session, questions)
    answer_results_to_file(answers)
    answer_lines_to_file(answers)
    question_results_to_file(questions)
    question_lines_to_file(questions)

Code Example #10
File: get_snippets.py  Project: andrewhead/StackSkim
import logging
import tokenize
import re
from StringIO import StringIO
import argparse
from progressbar import ProgressBar, Percentage, Bar, RotatingMarker, ETA, Counter

import cache
import models
from models import Page, Snippet, Token, Comment, SnippetComment, SnippetToken
from sites import SITES


logging.basicConfig(level=logging.INFO, format="%(message)s")
HEADER_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
TEXT_TAGS = ['p', 'div']
NONTEXT_TAGS = ['script']
requests_session = cache.get_session()


def extract_code(node):
    # Strip interactive interpreter prompts (">>> " and "... ") from the start
    # of each line so the snippet reads as plain source code.
    code_text = node.text
    code_text = re.sub('^>>> ', '', code_text, flags=re.MULTILINE)
    code_text = re.sub('^\.\.\. ', '', code_text, flags=re.MULTILINE)
    return code_text


def is_text(text):
    # The heuristic we use here is that we only consider a string to be a textual
    # description if it contains at least two consecutive alphabetic letters.
    if not re.search('[A-Za-z]{2}', text):
        return False
    return True
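
A small usage sketch for extract_code and is_text; the HTML fragment below is invented for illustration:

# Hypothetical usage sketch; the <pre> markup is made up.
from bs4 import BeautifulSoup as Soup

node = Soup("<pre>>>> print('hi')\n... pass</pre>", 'html.parser').pre
print(extract_code(node))             # "print('hi')\npass"
print(is_text("$ 12 + 3"))            # False: no two consecutive letters
print(is_text("wget a page of HTML")) # True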
Code Example #11
File: api.py  Project: andrewhead/StackSkim
#! /usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
import logging
import ConfigParser
import slumber
import os.path
import cache


logging.basicConfig(level=logging.INFO, format="%(message)s")
default_requests_session = cache.get_session(timeout=1)
default_requests_session.headers['User-Agent'] =\
    "Austin Le (for academic analysis) <*****@*****.**>"

gh_config = ConfigParser.ConfigParser()
gh_config.read(os.path.expanduser(os.path.join('~', '.github', 'github.cfg')))
gh_username = gh_config.get('auth', 'username')
gh_password = gh_config.get('auth', 'password')


class Github(slumber.API):
    def __init__(self, *args, **kwargs):
        super(self.__class__, self).__init__(
            'https://api.github.com',
            auth=(gh_username, gh_password),
            session=cache.get_session(timeout=1.0),
            *args, **kwargs
        )
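
Code Example #11 only defines the constructor. Since Github subclasses slumber.API, a minimal usage sketch might look like the following; the repository coordinates are placeholders, and the call pattern is an assumption based on slumber's documented resource chaining rather than code from the project.

# Hypothetical usage sketch: slumber maps attribute access and call arguments onto
# URL path segments, building the repos/<owner>/<repo> resource path.
# (Depending on the endpoint, slumber's default trailing slash may need append_slash=False.)
gh = Github()
repo_info = gh.repos("andrewhead")("StackSkim").get()
print(repo_info["full_name"])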