def setUp(self):
        database_factory = DatabaseFactory(settings.SQL_HOST, settings.SQL_PORT,
                                           settings.SQL_USER, settings.SQL_PASSWD,
                                           settings.SQL_DB, settings.SQL_COLLECTION_NAME)

        self._cache_db = database_factory.get_database(CollectionTypes.cache)
        self._history_db = database_factory.get_database(CollectionTypes.history)
        self._parser = websites_parses.get(self.url_from)
    def setUp(self):
        database_factory = DatabaseFactory(settings.SQL_HOST, settings.SQL_PORT,
                                           settings.SQL_USER, settings.SQL_PASSWD,
                                           settings.SQL_DB, settings.SQL_COLLECTION_NAME)

        self._cache_db = database_factory.get_database(CollectionTypes.cache)
        self._history_db = database_factory.get_database(CollectionTypes.history)
        self._item_db = database_factory.get_database(CollectionTypes.item)
        self._page_db = database_factory.get_database(CollectionTypes.page)

        self.page_url = 'http://indianexpress.com/article/opinion/columns/burhan-wani-death-kashmir-protests-mehbooba-mufti-hurriyat-2909858/'
        self.url_from = WebsiteTypes.indianexpress.value
    def setUp(self):
        database_factory = DatabaseFactory(settings.SQL_HOST, settings.SQL_PORT,
                                           settings.SQL_USER, settings.SQL_PASSWD,
                                           settings.SQL_DB, settings.SQL_COLLECTION_NAME)

        self._page_db = database_factory.get_database(CollectionTypes.page)

        self.dict = {
            WebsiteTypes.dailyo.value: 'http://www.dailyo.in/politics?page={}',
            WebsiteTypes.firstpost.value: 'http://www.firstpost.com/politics?page={}',
        }
        self.page_index = 1
        self.wp_index = 0
        self.scraped_pages_count = 20
        self.wp_status = WholePagesStatus(self.dict, self._page_db, self.scraped_pages_count)
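
# The three setUp variants above share one pattern: build a single DatabaseFactory
# from the SQL_* settings, then ask it for one handle per collection type. A
# minimal, hypothetical sketch of that interface follows; only the names
# DatabaseFactory, get_database and the CollectionTypes members appear in the
# snippets above, everything else is illustrative.
class _DatabaseHandle(object):
    # stand-in for the real per-collection database wrapper
    def __init__(self, conn_info, collection_type):
        self.conn_info = conn_info
        self.collection_type = collection_type


class DatabaseFactory(object):
    def __init__(self, host, port, user, passwd, db, collection_name):
        self._conn_info = (host, port, user, passwd, db, collection_name)
        self._handles = {}

    def get_database(self, collection_type):
        # cache one wrapper per collection type (cache, history, item, page, ...)
        if collection_type not in self._handles:
            self._handles[collection_type] = _DatabaseHandle(self._conn_info,
                                                             collection_type)
        return self._handles[collection_type]
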
def generateCurves(p_min_freq, p_max_freq):
    db = DatabaseFactory.getDatabase(database)

    l_min_freq = int(p_min_freq)
    l_max_freq = int(p_max_freq)

    keywords = KeywordsUtil.pickAllInFrequencyRange(l_min_freq, l_max_freq)

    t = time.time()
    with open(csvFileName, 'w') as csvFile:
        csvWriter = csv.writer(csvFile,
                               delimiter=',',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)

        progress = 0
        skipped = 0
        for keyword in keywords:
            progress += 1
            curve = []
            curve.extend(keyword)

            # 1. Get total coordinates of this keyword
            t0 = time.time()
            totalCoordinates = db.GetCoordinate(tableName, keyword[0], -1)
            if len(totalCoordinates) == 0:
                skipped += 1
                continue
            if len(totalCoordinates) != keyword[1]:
                print '[' + keyword[0] + ']', \
                    'frequency = ' + str(keyword[1]) + ', len(perfect) = ' + str(len(totalCoordinates))

            # 2. for each k percentage calculate the quality value
            t1 = time.time()
            for k_percentage in k_percentages:
                similarity = QualityUtil.phQualityOfKPercentage(
                    totalCoordinates, k_percentage, x_scale, y_scale)
                curve.append(similarity)

            # 3. write down the curve of this keyword to csv file
            # [keyword, frequency, q(5%), q(10%), ......, q(95%)]
            t2 = time.time()
            csvWriter.writerow(curve)
            t3 = time.time()
            # 4. output time information to console
            print keyword[0], \
                ", [freq]", keyword[1], \
                ", [query]", t1 - t0, \
                ", [curve]", t2 - t1, \
                ", [write]", t3 - t2, \
                ", [total]", time.time() - t0, \
                ", [All]", time.time() - t, \
                ", [progress]", str(progress * 100 / len(keywords)) + '%'

    return len(keywords) - skipped
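
# A small companion sketch (not part of the original module): read the curve CSV
# written by generateCurves back into memory. It only assumes the row layout
# documented above, [keyword, frequency, q(5%), ..., q(95%)]; pass in the same
# csvFileName used above. The function name loadCurves is illustrative.
import csv

def loadCurves(p_csvFileName):
    curves = {}
    with open(p_csvFileName, 'r') as csvFile:
        for row in csv.reader(csvFile, delimiter=',', quotechar='"'):
            keyword = row[0]
            frequency = int(row[1])
            qualities = [float(q) for q in row[2:]]
            curves[keyword] = (frequency, qualities)
    return curves
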
Example #5
def generateKValues(p_min_freq, p_max_freq):
    db = DatabaseFactory.getDatabase(dbtype, database)

    l_min_freq = int(p_min_freq)
    l_max_freq = int(p_max_freq)

    keywords = KeywordsUtil.pickInFrequencyRange(db, l_min_freq, l_max_freq)

    t = time.time()
    with open(csvFileName_k_values, 'w') as csvFile_k_values:
        with open(csvFileName_k_ratios, 'w') as csvFile_k_ratios:
            csvWriter_k_values = csv.writer(csvFile_k_values,
                                            delimiter=',',
                                            quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
            csvWriter_k_ratios = csv.writer(csvFile_k_ratios,
                                            delimiter=',',
                                            quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)

            progress = 0
            skipped = 0
            for keyword in keywords:
                progress += 1
                k_values = []
                k_ratios = []
                k_values.append(keyword)
                k_ratios.append(keyword)

                # 1. Get total coordinates of this keyword
                t0 = time.time()
                totalCoordinates = db.GetCoordinate(tableName, keyword, -1)
                if len(totalCoordinates) == 0:
                    skipped += 1
                    continue

                # 2. for each target quality calculate the k value using binary search
                t1 = time.time()
                for quality in qualities:
                    k, kp = QualityUtil.findKOfQuality(totalCoordinates, float(quality)/100.0, x_scale, y_scale)
                    k_values.append(k)
                    k_ratios.append(kp)

                # 3. write down the k_values / k_ratios of this keyword to csv file
                # [keyword, frequency, k(0.7), k(0.75), ......, k(0.95)]
                # [keyword, frequency, kp(0.7), kp(0.75), ......, kp(0.95)]
                t2 = time.time()
                csvWriter_k_values.writerow(k_values)
                csvWriter_k_ratios.writerow(k_ratios)
                t3 = time.time()
                # 4. output time information to console
                print keyword, \
                    ", [query]", t1 - t0, \
                    ", [k_values/ratios]", t2 - t1, \
                    ", [write]", t3 - t2, \
                    ", [total]", time.time() - t0, \
                    ", [All]", time.time() - t, \
                    ", [progress]", str(progress * 100 / len(keywords)) + '%'

    return len(keywords) - skipped
Example #6
def testDrawHeat(p_keyword):
    db = DatabaseFactory.getDatabase(Conf.DBTYPE)
    totalCoordinates = np.array(db.GetCoordinate(Conf.TABLE, p_keyword, -1))
    if len(totalCoordinates) == 0:
        return
    img = QualityUtil.coordinatesToImage(totalCoordinates)
    DrawHeat(np.fliplr(img).transpose(), 'no', 'heat_' + p_keyword + '.png')
    DrawHeat(
        np.fliplr(img).transpose(), 'sum', 'heat_' + p_keyword + '_sum.png')
    DrawHeat(
        np.fliplr(img).transpose(), 'max', 'heat_' + p_keyword + '_max.png')
def run(p_database, p_tableName, p_withOrderBy):

    db = DatabaseFactory.getDatabase(p_database)

    # [k, avg_time(word[1]), avg_time(word[2]), ..., avg_time(word[N])]
    execTime_All = []
    shuffled_keywords = copy.deepcopy(keywords)
    for k_percent in k_percentages:
        execTime_k = [k_percent]
        # sum_runs = {word1: time, word2: time, ...}
        sum_runs = {}
        # Average several runs
        for i in range(num_runs):
            print 'k percentage ==== ', str(k_percent), '  running ', str(i + 1)
            # restart the MySQL server
            db.restart()

            # send dummy queries to warm up the database
            startT = time.time()
            db.queryDummy()
            endT = time.time()
            print 'dummy query takes ', str(endT - startT), ' seconds.'

            # every run shuffle the order of querying keywords
            random.shuffle(shuffled_keywords)
            # (word, count)
            for keyword in shuffled_keywords:
                k = int(round(keyword[1] * k_percent))
                if p_withOrderBy:
                    execTime = execTimeLimitKAndOrderBy(
                        db, p_tableName, keyword[0], k, orderBy)
                else:
                    execTime = execTimeLimitK(db, p_tableName, keyword[0], k)
                if keyword[0] in sum_runs:
                    sum_runs[keyword[0]] += execTime
                else:
                    sum_runs[keyword[0]] = execTime

        # print sum_runs
        # Use defined order of keywords to store the average execution time
        for keyword in keywords:
            # print 'sums of [', keyword[0], '] = ', str(sum_runs[keyword[0]])
            execTime_k.extend([sum_runs[keyword[0]] / num_runs])

        execTime_All.append(execTime_k)

        # print all results until now
        print_execTime(execTime_All)

    return execTime_All
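
# Neither execTimeLimitK nor execTimeLimitKAndOrderBy is shown in this listing;
# a plausible minimal sketch is below. It assumes the db wrapper exposes the
# same generic query(sql) method used in the AsterixDB snippet further down and
# that the table can be filtered by a 'word' column; adapt the SQL to the real
# schema. The point is simply: issue the LIMIT-k query and return its wall-clock
# execution time.
import time

def execTimeLimitK(p_db, p_tableName, p_keyword, p_k):
    sql = "select * from " + p_tableName + \
          " where word = '" + p_keyword + "' limit " + str(p_k)
    start = time.time()
    p_db.query(sql)
    return time.time() - start

def execTimeLimitKAndOrderBy(p_db, p_tableName, p_keyword, p_k, p_orderBy):
    sql = "select * from " + p_tableName + \
          " where word = '" + p_keyword + "'" + \
          " order by " + p_orderBy + " limit " + str(p_k)
    start = time.time()
    p_db.query(sql)
    return time.time() - start
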
Example #8
def collectWordsCardinalities():
    db = DatabaseFactory.getDatabase(database)
    keywords = KeywordsUtil.pickAllInFrequencyRange(min_freq, max_freq)
    csvFileName = 'wordcardinality.csv'
    with open(csvFileName, 'w') as csvFile:
        csvWriter = csv.writer(csvFile,
                               delimiter=',',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        progress = 0
        t0 = time.time()
        for keyword in keywords:
            count = db.GetCount(tableName, keyword[0])
            csvWriter.writerow([keyword[0], count])
            progress += 1
            t1 = time.time()
            print "Total time:", t1 - t0, "Progress:", str(
                progress * 100 / len(keywords)) + '%'
Example #9
def collectWordCountForAsterixDB(p_limit):
    db = DatabaseFactory.getDatabase('AsterixDB')
    csvFileName = 'wordcount.csv'
    wordcount = Counter()

    # 1. get the cardinality of the whole table
    print "[1] Get the cardinality of the whole table ......"
    sql = "select count(*) as count from limitdb." + tableName
    results = db.query(sql)
    tableCardinality = results[0]['count']

    # 2. traverse the whole table by limit and offset and count tokens
    print "[2] Traverse the whole table by limit and offset and count tokens ......"
    progress = 0
    t0 = time.time()
    for offset in range(0, tableCardinality, p_limit):
        sql = "select word_tokens(t.text) as tokens from limitdb." + tableName + \
              " t limit " + str(p_limit) + " offset " + str(offset)
        # [{'tokens': ['t1', 't2']}, {'tokens':['t3', 't4', 't5']}]
        results = db.query(sql)
        for record in results:
            # get rid of stop words
            tokens = [
                word for word in record['tokens']
                if word not in stopwords.words('english') and len(word) > 2
            ]
            wordcount.update(Counter(tokens))

        progress += p_limit
        t1 = time.time()
        print "Total time:", t1 - t0, "Progress:", str(
            float(progress) * 100 / float(tableCardinality)) + '%'

    # 3. write to csv file
    print "[3] Writing wordcount dictionary into csv file", csvFileName, "......"
    with open(csvFileName, 'w') as csvFile:
        csvWriter = csv.writer(csvFile,
                               delimiter=',',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        for key, value in sorted(wordcount.iteritems(),
                                 key=lambda (k, v): (v, k),
                                 reverse=True):
            csvWriter.writerow([key, value])
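
# Note on the stop-word filter above: stopwords.words('english') is re-read for
# every token, which is slow over a large table. A behavior-preserving variant
# builds the set once; a minimal self-contained sketch (the helper name
# filterTokens is illustrative):
from nltk.corpus import stopwords

ENGLISH_STOPWORDS = set(stopwords.words('english'))

def filterTokens(p_tokens):
    # keep tokens that are not stop words and are longer than 2 characters
    return [word for word in p_tokens
            if word not in ENGLISH_STOPWORDS and len(word) > 2]
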
Example #10
import Conf
import DatabaseFactory
import numpy as np
import QualityUtil

###########################################################
#   Configurations
###########################################################
database = Conf.DBTYPE
tableName = Conf.TABLE

db = DatabaseFactory.getDatabase(database)


# Given a keyword, use binary search to find the limit k value for target quality with given r value
# p_keyword - keyword
# p_quality - target quality value, float in [0, 1]
# p_r - r value for hybrid approach with condition random() < r in the query, float in [0, 1]
# return [limit k value, limit k ratio] with which we can get target quality
#        [-1, -1] if with maximum k value, we still cannot get target quality
def findKOfQuality(p_keyword, p_quality, p_r, p_totalCoordinates):

    # 1. Get perfect image if not indicated by argument
    if p_totalCoordinates:
        totalCoordinates = p_totalCoordinates
    else:
        totalCoordinates = db.GetCoordinate(tableName, p_keyword, -1)

    # 2. If max possible image with given r value is smaller than p_quality, return [-1, -1]
    maxCoordinates = db.GetCoordinateHybrid(tableName, p_keyword, p_r,
                                            int(len(totalCoordinates) * p_r))
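
# The example above is cut off before the search loop itself. A minimal,
# self-contained sketch of the binary search it describes: given the perfect
# coordinate list and a quality function (something like the
# QualityUtil.phQualityOfKPercentage used in earlier snippets, here passed in as
# a callback so nothing about its real signature is assumed), find the smallest
# prefix length k whose sample reaches the target quality. The [-1, -1] failure
# convention follows the comment above; the rest is illustrative.
def binarySearchKOfQuality(p_totalCoordinates, p_quality, p_qualityOfSample):
    if len(p_totalCoordinates) == 0:
        return [-1, -1.0]
    low = 0
    high = len(p_totalCoordinates)
    # even the full coordinate set cannot reach the target quality
    if p_qualityOfSample(p_totalCoordinates[:high], p_totalCoordinates) < p_quality:
        return [-1, -1.0]
    while low < high:
        mid = (low + high) // 2
        if p_qualityOfSample(p_totalCoordinates[:mid], p_totalCoordinates) < p_quality:
            low = mid + 1
        else:
            high = mid
    return [low, float(low) / len(p_totalCoordinates)]
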
Example #11
    'word': 'soccer'
}, {
    'database': 'limitdb2',
    'table_name': 'coord_tweets',
    'quality_function': 'PH',
    'word': 'soccer'
}]
x_label = 'K Percentage (%)'
y_label = 'Quality'
title = 'K-Q curves of "soccer" on different data size'

# 2. Get the curves from corresponding database
db = None
curves = []
for targetCurve in targetCurves:
    database = targetCurve['database']
    if db is None:
        db = DatabaseFactory.getDatabase(dbtype, database)
    elif db.getDatabase() != database:
        db.close()
        db = DatabaseFactory.getDatabase(dbtype, database)

    curve = db.queryCurve(targetCurve['table_name'],
                          targetCurve['quality_function'], targetCurve['word'])
    curves.append(curve)

print curves
# 3. plot the curves
PlotUtil.plotCurves(pdfBasePath + '/' + fileName, labels, x, curves, x_label,
                    y_label, title)
Example #12
def buildModel(p_database, p_tableName, p_min_freq, p_max_freq):

    db = DatabaseFactory.getDatabase(p_database)

    l_min_freq = int(p_min_freq)
    l_max_freq = int(p_max_freq)

    keywords = KeywordsUtil.pickLowestInFrequencyRange(l_min_freq, l_max_freq,
                                                       2000)

    # [word, count, k, ratio, quality]
    keyword_models = []

    t = time.time()
    if len(keywords) > 0:
        for keyword in keywords:
            t0 = time.time()
            # create perfect image of the file
            ar = np.array(db.GetCoordinate(p_tableName, keyword[0], -1))
            if len(ar) == 0:
                continue
            t1 = time.time()
            H = hashByNumpy(ar)
            perfectLen = np.count_nonzero(H)
            i = 0.0
            low = 0.0
            high = 100.0
            similarity = 0.0
            iterTimes = 0
            t2 = time.time()
            while (similarity < Q0
                   or similarity > Q0 + 0.01) and iterTimes < 10:
                # binary search for the target K0 for target quality 0.85
                if similarity < Q0:
                    low = i
                    i = (high + i) / 2
                else:
                    high = i
                    i = (i + low) / 2
                k = int(i * len(ar) / 100)
                Hs = hashByNumpy(ar[:k])
                sampleLen = np.count_nonzero(Hs)
                similarity = float(sampleLen) / perfectLen
                iterTimes += 1
            print keyword[0], \
                "quality:", similarity, \
                "k:", k, \
                "ratio:", i, \
                "fetch:", t1 - t0, \
                "draw:", t2 - t1, \
                "search:", time.time() - t2
            keyword_models.append([keyword[0], keyword[1], k, i, similarity])

    with open('keyword_models.csv', 'w') as csvFile:
        csvWriter = csv.writer(csvFile,
                               delimiter=',',
                               quotechar='|',
                               quoting=csv.QUOTE_MINIMAL)
        for line in keyword_models:
            csvWriter.writerow(line)
    print "Total time of", p_tableName, ":", time.time() - t
import Conf
import DatabaseFactory
import KeywordsUtil
import Modeler
import PlotUtil
import numpy as np
import json

###########################################################
#   Configurations
###########################################################
dbType = Conf.DBTYPE
databaseName = Conf.DATABASE
tableName = Conf.TABLE

db = DatabaseFactory.getDatabase(dbType)

# From what frequency, choose keywords
frequencies = [
    100000, 500000, 1000000, 2000000, 3000000, 5000000, 8000000, 10000000,
    12000000, 15000000, 20000000
]
# For each frequency, how many keywords we choose
numOfKeywords = 3

# Target Quality
quality = 0.85

reversed_order = False

order_suffix = 'asc'
Example #14
                        default=0.2)  # validation set size
    parser.add_argument("-epo",
                        "--epochs",
                        help="# Epochs to generate pairs for",
                        type=int,
                        default=25)  # epochs for generating pairs

    try:
        args = parser.parse_args()
    except:
        parser.print_help()
        print(debug_msg())
        exit(0)

    if args.build:
        print("Disassemblying files and creating dataset")
        print(build_configuration(args.db, args.dir, args.symbols))
        factory = DatabaseFactory.DatabaseFactory(args.db, args.dir)
        factory.build_db(args.symbols)

    if args.split:
        print("Splitting data and generating epoch pairs")
        print(
            split_configuration(args.db, args.val_size, args.test_size,
                                args.epochs))
        splitter = DataSplitter.DataSplitter(args.db)
        splitter.split_data(args.val_size, args.test_size)
        splitter.create_pairs(args.epochs)

    exit(0)