Example 1
def get_samples(db, limit, offset=0):
    """
    Returns a combined list of negative and positive samples in a (text, label) format.

    Arguments:
    db (str) -- Name of the database to use.
    limit (int) -- Amount of samples to retrieve.

    Keyword Arguments:
    offset (int) -- Where to start getting samples from.

    """
    conn = db_init(db=db)
    cursor = conn.cursor()

    sql =  "SELECT text, sentiment FROM item WHERE sentiment = ? LIMIT ? OFFSET ?"

    if limit < 2:
        limit = 2

    if limit > get_sample_limit(db):
        limit = get_sample_limit(db)

    if limit % 2 != 0:
        limit -= 1  # we want an even number

    # half the samples come from each class; floor division keeps ints
    limit = limit // 2
    offset = offset // 2

    cursor.execute(sql, ["negative", limit, offset])
    neg_samples = cursor.fetchall()

    cursor.execute(sql, ["positive", limit, offset])
    pos_samples = cursor.fetchall()

    return pos_samples + neg_samples
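
A minimal usage sketch (the database name and limit here are hypothetical; it assumes a samples database already populated by collect() or fetch() below):

samples = get_samples('samples.db', 1000)
for text, label in samples[:3]:
    print("%s -> %s" % (label, text))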
Example 2
def get_sample_limit(db):
    """
    Returns the limit of samples so that both positive and negative samples
    will remain balanced.

    Arguments:
    db (str) -- Name of the database to use.

    """

    # counting rows is expensive on a large database, so we cache
    # the computed limit in redis and reuse it when we can
    m = RedisManager()
    if 'limit' in m.r.keys():
        return int(m.r.get('limit'))

    conn = db_init(db=db)
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM item where sentiment = 'positive'")
    pos_count = cursor.fetchone()[0]
    cursor.execute("SELECT COUNT(*) FROM item where sentiment = 'negative'")
    neg_count = cursor.fetchone()[0]
    # the balanced limit is the smaller of the two class counts
    limit = min(pos_count, neg_count)

    m.r.set('limit', limit)

    return limit
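
Note that the cached count never expires, so it goes stale once new samples are collected. A minimal invalidation sketch, assuming RedisManager().r is a standard redis-py client as the code above suggests:

# Hypothetical cleanup after collecting new samples, so the next
# get_sample_limit() call recomputes the balanced count.
RedisManager().r.delete('limit')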
Example 3
def twitter_feed(sentiment, last_id=None, new=False):
    """
    Returns positive and negative tweets from the Twitter search API.

    What is flagged as positive or negative is currently determined
    by happy/sad emoticons.
    """

    db = db_init()
    cursor = db.cursor()

    if sentiment == 'positive':
        query = ':)'
    elif sentiment == 'negative':
        query = ':('
    else:
        print('Sentiment must be either positive or negative.')
        return

    last_id_url = "http://search.twitter.com/search.json?lang=en&q=%s&since_id=%s"
    query_url   = "http://search.twitter.com/search.json?lang=en&q=%s"

    if not (last_id or new):
        cursor.execute('SELECT item_id FROM item WHERE sentiment=? ORDER BY item_id DESC LIMIT 1', [sentiment])
        last_id = cursor.fetchone()[0]
        url = last_id_url % (query, last_id)
    elif last_id:
        url = last_id_url % (query, last_id)
    elif new:
        url = query_url % query

    data = json.loads(urllib2.urlopen(url).read())

    if data:

        items = data['results']
        cursor.execute('SELECT COUNT(*) FROM item WHERE sentiment = ?', [sentiment])
        total_rows = cursor.fetchone()[0]

        for item in items:

            text = unicode(item['text'])  # force unicode for db

            if text:

                item_id = item['id']

                try:
                    cursor.execute('INSERT INTO item VALUES (NULL,?,?,?)', [item_id, text, sentiment])
                    last_id = item_id
                    print("%s %s %s" % (sentiment, total_rows, text))
                    total_rows += 1
                except sqlite3.IntegrityError:
                    pass  # these are duplicates, and we don't want duplicates

        db.commit()  # persist the inserts (assuming db_init() does not autocommit)
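
A minimal usage sketch; new=True skips the stored last_id and just pulls the latest matching tweets:

# Pull a fresh batch of each class from the search API.
twitter_feed('positive', new=True)
twitter_feed('negative', new=True)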
Example 4
def fetch(verbose=True):
    """
    Pre-populates training database from public archive of ~2mil tweets
    """

    if not verbose:
        logger.setLevel(0)

    response = urllib2.urlopen('https://github.com/downloads/Tawlk/synt/sample_data.bz2')

    total_bytes = int(response.info().getheader('Content-Length').strip())
    saved_bytes = 0
    start_time = time.time()
    last_seconds = 0
    last_seconds_start = 0
    data_buffer = StringIO()

    decompressor = bz2.BZ2Decompressor()

    if os.path.exists(settings.DB_FILE):
        os.remove(settings.DB_FILE)

    db = db_init(create=False)
    db.set_progress_handler(import_progress, 20)

    while True:
        seconds = (time.time() - start_time)
        chunk = response.read(8192)
        if not chunk:
            break
        saved_bytes += len(chunk)
        data_buffer.write(decompressor.decompress(chunk))
        if seconds > 1:
            percent = round((float(saved_bytes) / total_bytes)*100, 2)
            speed = round((float(saved_bytes) / seconds) / 1024, 2)
            speed_type = 'KB/s'
            if speed > 1000:
                speed = round((float(saved_bytes) / seconds) / 1048576, 2)
                speed_type = 'MB/s'
            if last_seconds >= 0.5:
                last_seconds = 0
                last_seconds_start = time.time()
                logger.info("Downloaded %d of %d MB, %s%s (%0.2f%%)\r" % (saved_bytes/1048576, total_bytes/1048576, speed, speed_type, percent))
            else:
                last_seconds = (time.time() - last_seconds_start)
        if saved_bytes == total_bytes:
            logger.info("Downloaded %d of %d MB, %s%s (100%%)\r" % (saved_bytes/1048576, total_bytes/1048576, speed, speed_type))
            try:
                db.executescript(data_buffer.getvalue())
            except Exception as e:
                logger.error("Sqlite3 import failed with: %s" % e)
                break
Example 5
def fetch(db_name="samples.db"):
    """
    Pre-populates training database from public archive of ~2mil tweets.
    Stores training database as db_name in ~/.synt/

    Keyword Arguments:
    db_name (str) -- Custom name for database.

    """

    response = urllib2.urlopen("https://github.com/downloads/Tawlk/synt/sample_data.bz2")

    total_bytes = int(response.info().getheader("Content-Length").strip())
    saved_bytes = 0
    start_time = time.time()
    last_seconds = 0
    last_seconds_start = 0
    data_buffer = StringIO()

    decompressor = bz2.BZ2Decompressor()

    fp = os.path.join(os.path.expanduser(config.SYNT_PATH), db_name)

    if os.path.exists(fp):
        os.remove(fp)

    db = db_init(db=db_name, create=False)
    db.set_progress_handler(import_progress, 20)

    while True:
        seconds = time.time() - start_time
        chunk = response.read(8192)

        if not chunk:
            break

        saved_bytes += len(chunk)
        data_buffer.write(decompressor.decompress(chunk))

        if seconds > 1:
            percent = round((float(saved_bytes) / total_bytes) * 100, 2)
            speed = round((float(saved_bytes) / seconds) / 1024, 2)
            speed_type = "KB/s"

            if speed > 1000:
                speed = round((float(saved_bytes) / seconds) / 1048576, 2)
                speed_type = "MB/s"

            if last_seconds >= 0.5:
                last_seconds = 0
                last_seconds_start = time.time()
                print(
                    "Downloaded %d of %d MB, %s%s (%0.2f%%)\r"
                    % (saved_bytes / 1048576, total_bytes / 1048576, speed, speed_type, percent)
                )
            else:
                last_seconds = time.time() - last_seconds_start

        if saved_bytes == total_bytes:
            print(
                "Downloaded %d of %d MB, %s%s (100%%)\r"
                % (saved_bytes / 1048576, total_bytes / 1048576, speed, speed_type)
            )

            try:
                db.executescript(data_buffer.getvalue())
            except Exception as e:
                print("Sqlite3 import failed with: %s" % e)
                break
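
A minimal usage sketch; note that fetch() removes any existing database of the same name under ~/.synt/ before importing:

# Downloads the ~2mil tweet archive and imports it into ~/.synt/samples.db.
fetch(db_name="samples.db")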
Example 6
def collect(db_name="", commit_every=1000, max_collect=400000, queries_file=""):
    """
    Will continuously populate the sample database if it exists
    else it will create a new one.
    
    Keyword Arguments:
    db_name (str) -- Custom name for database.
    commit_every (int) -- Commit to sqlite after commit_every executes.
    max_collect (int) -- Will stop collecting at this number.
    queries_file (str) -- If queries file is provided should be a path to a text file
                          containing the queries in the format:
                          
                          label 
                          query1
                          queryN

    """

    if not db_name:
        d = datetime.datetime.now()
        # if no dbname is provided we'll store a timestamped db name
        db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    db = db_init(db=db_name)
    cursor = db.cursor()

    queries = {}

    if queries_file:
        try:
            with open(queries_file) as f:
                words = [line.strip() for line in f.readlines()]
            # the first line is the label; the remaining lines are queries
            label = words[0]
            for w in words[1:]:
                queries[w] = label
        except IOError:
            pass

    else:
        queries[":)"] = "positive"
        queries[":("] = "negative"

    # collect on twitter with kral
    g = stream(query_list=queries.keys(), service_list="twitter")

    c = 0
    for item in g:

        text = unicode(item["text"])

        sentiment = queries.get(item["query"], None)

        if sentiment:
            try:
                cursor.execute("INSERT INTO item VALUES (NULL,?,?)", [text, sentiment])
                c += 1
                if c % commit_every == 0:
                    db.commit()
                    print("Commited {}".format(commit_every))
                if c == max_collect:
                    break
            except IntegrityError:  # skip duplicates
                continue

    db.close()
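
A hypothetical queries file for the format described above (first line is the label, each remaining line is a query), and the matching call:

queries.txt:
positive
:)
:-)
<3

collect(db_name="samples.db", queries_file="queries.txt")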
Example 7
def collect(db_name='', commit_every=1000, max_collect=400000, query_file=''):
    """
    Will continuously populate the sample database if it exists
    else it will create a new one.

    Keyword Arguments:
    db_name (str) -- Custom name for database.
    commit_every (int) -- Commit to sqlite after commit_every executes.
    max_collect (int) -- Will stop collecting at this number.
    query_file (str) -- If query file is provided should be absolute path to text file.
    """

    #collect requires kral
    try:
        from kral import stream
    except ImportError:
        raise ImportError("Requires the kral package in order to collect.")

    if not db_name:
        d = datetime.datetime.now()
        #if no dbname is provided we'll store a timestamped db name
        db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    db = db_init(db=db_name)
    cursor = db.cursor()

    queries = {}

    if query_file:
        if not os.path.exists(query_file):
            return "Query file path does not exist."

        with open(query_file) as f:
            words = [line.strip() for line in f.readlines()]
        # the first line is the label; the remaining lines are queries
        label = words[0]
        for w in words[1:]:
            queries[w] = label

    else:
        queries[':)'] = 'positive'
        queries[':('] = 'negative'

    #collect on twitter with kral
    g = stream(query_list=queries.keys(), service_list="twitter")

    c = 0
    for item in g:

        text = unicode(item['text'])

        sentiment = queries.get(item['query'], None)

        if sentiment:
            try:
                cursor.execute('INSERT INTO item VALUES (NULL,?,?)', [text, sentiment])
                c += 1
                if c % commit_every == 0:
                    db.commit()
                    print("Commited {}".format(commit_every))
                if c == max_collect:
                    break
            except IntegrityError: #skip duplicates
                continue

    db.close()
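
Without a query file, collect() falls back to the emoticon queries above. A minimal sketch, assuming kral is installed and the Twitter stream is reachable (the numbers are arbitrary):

# Labels ':)' matches as positive and ':(' matches as negative,
# committing every 500 inserts and stopping after 10000 samples.
collect(commit_every=500, max_collect=10000)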