Example #1
def tabulate2CSV(outPath, metric, k=None, datatype='float'):
    # Collect every year for which any conference reports this metric
    allYears = set()
    for conferenceName in conferences:
        allYears.update(metrics[conferenceName].getMetric(metric, k).keys())

    f = open(outPath, 'wb')
    writer = UnicodeWriter(f)
    header = ['year'] + [c.upper() for c in conferences]
    headerStr = [str(item) for item in header]
    writer.writerow(headerStr)
    # One row per year, newest first
    for year in reversed(sorted(allYears)):
        row = [year]
        for conferenceName in conferences:
            yearly = metrics[conferenceName].getMetric(metric, k)
            try:
                if datatype == 'float':
                    row.append('%.3f' % yearly[year])
                elif datatype == 'int':
                    row.append(yearly[year])
            except KeyError:  # conference has no value for this year
                row.append('')
        rowStr = [str(item) for item in row]
        writer.writerow(rowStr)
    f.close()
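
All five examples write through a UnicodeWriter rather than a plain csv.writer. The class itself is not shown here; it is presumably the classic Python 2 recipe from the csv module documentation, reproduced below as a minimal sketch (UTF-8 output assumed):

import csv
import codecs
import cStringIO

class UnicodeWriter:
    """Write rows of unicode strings to a byte stream as UTF-8 CSV."""

    def __init__(self, f, dialect=csv.excel, encoding='utf-8', **kwds):
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        # Python 2's csv module can't write unicode directly: encode to
        # UTF-8, write to the buffer, then re-encode to the target encoding
        self.writer.writerow([s.encode('utf-8') for s in row])
        data = self.queue.getvalue().decode('utf-8')
        self.stream.write(self.encoder.encode(data))
        self.queue.truncate(0)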
Example #2
def scatterPlot(outPath, metric1, metric2, k1=None, k2=None):
    f = open(outPath, 'wb')
    writer = UnicodeWriter(f)
    writer.writerow(
        ['year',
         '%s%s' % (metric1, k1),  # %s rather than %d: k1/k2 may be None
         '%s%s' % (metric2, k2), 'conference'])
    for conferenceName in conferences:
        m1 = metrics[conferenceName].getMetric(metric1, k1)
        m2 = metrics[conferenceName].getMetric(metric2, k2)
        # Only plot years for which both metrics are available, newest first
        commonYears = set(m1.keys()).intersection(m2.keys())
        for year in reversed(sorted(commonYears)):
            x = m1[year]
            y = m2[year]
            row = [year, x, y, conferenceName]
            rowStr = [str(item) for item in row]
            writer.writerow(rowStr)
    f.close()
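
A hypothetical invocation, assuming the module-level metrics and conferences structures are already populated (the output path, metric names, and k values below are placeholders):

# Placeholder metric names and k values, for illustration only
scatterPlot('../resources/scatter.csv', 'metricA', 'metricB', k1=5, k2=5)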
Example #3
import os
import time
from Queue import Queue  # Python 2 stdlib queue

# Project-local helpers (UnicodeReader, UnicodeWriter, RandomReader,
# Downloader, FaceDetector, picUtils, GoogleImage, GoogleSearch, Wikipedia,
# isDefaultGravatarPic and the _* flags) are assumed to be defined elsewhere.


def main():

    dataPath = "../resources/SOusers-Mar13.csv"  # File containing the SO user dump
    resultsPath = "../resources/features3.csv"  # File where features will be stored
    picPath = "../resources/SOpictures/"  # Directory where pictures will be downloaded

    fr = open(dataPath, 'rb')
    fw = open(resultsPath, 'ab')

    if _RANDOM:
        reader = RandomReader(fr)
    else:
        reader = UnicodeReader(fr)

    writer = UnicodeWriter(fw)

    queue = Queue()
    if _FACE:
        faceDetector = FaceDetector()

    threads = []
    SOhashes = {}  # Dictionary of user's hashes

    # Use multiple threads to download and get information
    for i in xrange(10):
        threads.append(Downloader(queue))
        threads[-1].start()

    idx = 0
    size = 4500  # Number of subjects

    for row in reader:
        if idx < size:
            so_uid = row[0]
            so_hash = row[2]
            if so_hash not in SOhashes:
                SOhashes[so_hash] = so_uid
                if not isDefaultGravatarPic(so_hash):
                    data = [so_uid]
                    if _VISUAL_FEATURES:

                        # Download picture
                        filepath = os.path.join(picPath,
                                                '%d.jpg' % int(so_uid))
                        if not os.path.isfile(filepath):
                            queue.put(
                                ('http://www.gravatar.com/avatar/%s' % so_hash,
                                 filepath))
                            # Give a downloader thread time to fetch the file
                            # before it is loaded below
                            time.sleep(2)

                        # Load picture
                        pic = picUtils.loadPicture(filepath)

                        if _FACE:
                            if faceDetector.isFrontFace(
                                    pic) or faceDetector.isProfileFace(pic):
                                data.append(str(True))
                            else:
                                data.append(str(False))

                        if _MOST_COMMON_COLORS:
                            _, f1, _, f2 = picUtils.mostCommonColor(pic)
                            data.append(str(f1 + f2))

                        if _NBCOLORS:
                            data.append(str(picUtils.getNbOfColors(pic)))

                        if _FARTHEST_NEIGHBOR:
                            F1 = picUtils.farthestNeighborMetric(pic, 10)
                            F2 = picUtils.farthestNeighborMetric(pic, 200)
                            data.append(str(F1))
                            data.append(str(F2))
                            if F1 != 0:
                                data.append(str(F2 / F1))
                            else:
                                data.append('?')

                        if _AVERAGE_SATURATION:
                            data.append(str(picUtils.avgSaturation(pic)))

                        if _THRESHOLD_BRIGHTNESS:
                            data.append(str(picUtils.threBrightness(pic, 0.2)))

                    if _GOOGLE:
                        gi = GoogleImage('http://www.gravatar.com/avatar/%s' %
                                         so_hash)
                        bestGuess = gi.getBestGuess()
                        if bestGuess:
                            bestGuess = bestGuess.encode('utf8')
                            data.append(bestGuess)
                            if _WIKIPEDIA:
                                gs = GoogleSearch("%s site:en.wikipedia.org" %
                                                  bestGuess)
                                wikiTitlePage = gs.getWikipediaTitlePage()
                                if wikiTitlePage:
                                    wiki = Wikipedia(wikiTitlePage)
                                    wiki.categoryGraph(4)
                                    # Keep the ten highest-degree categories
                                    for cat in wiki.sortGraphByDegree()[:10]:
                                        data.append(str(cat))

                    # Write all information collected in the csv file
                    try:
                        print data
                        writer.writerow(data)
                        idx += 1
                    except Exception:
                        print "Error with data"
        else:
            break
    fr.close()
    fw.close()

    # Download finished: send one (None, None) sentinel per worker to stop it
    for i in xrange(10):
        queue.put((None, None))
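
The Downloader class is also not shown. Below is a minimal sketch consistent with how it is used above: a worker thread that pulls (url, filepath) pairs off the shared queue and treats (None, None) as its stop sentinel. Only the class name and the queue protocol come from the code; the body is an assumption:

import threading
import urllib

class Downloader(threading.Thread):
    """Hypothetical worker: download queued (url, filepath) pairs."""

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            url, filepath = self.queue.get()
            if url is None:  # (None, None) sentinel: stop this worker
                break
            try:
                urllib.urlretrieve(url, filepath)
            except IOError:
                pass  # skip pictures that fail to download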
Example #4
    # Fragment: pairwise overlap of two conferences, c and d
    print c, d

    # Per-year sets for each conference: PC members, authors, community members
    pc_c = metrics[c].pcPerYear
    pc_d = metrics[d].pcPerYear

    a_c = metrics[c].authorsPerYear
    a_d = metrics[d].authorsPerYear

    cm_c = metrics[c].membersPerYear
    cm_d = metrics[d].membersPerYear

    allYears = set(cm_c.keys()).intersection(cm_d.keys())

    outPath = os.path.join(metricsPath, 'pairwise', '%s_%s.csv' % (c, d))
    f = open(outPath, 'wb')
    writer = UnicodeWriter(f)
    header = [
        'YEAR', 'PC1', 'PC2', 'PC1_INT_PC2', 'PC1_UNI_PC2',
        'PC1_INT_PC2__REL__PC1_UNI_PC2', 'PC1_INT_PC2__REL__PC1',
        'PC1_INT_PC2__REL__PC2', 'A1', 'A2', 'A1_INT_A2', 'A1_UNI_A2',
        'A1_INT_A2__REL__A1_UNI_A2', 'A1_INT_A2__REL__A1',
        'A1_INT_A2__REL__A2', 'CM1', 'CM2', 'CM1_INT_CM2', 'CM1_UNI_CM2',
        'CM1_INT_CM2__REL__CM1_UNI_CM2', 'CM1_INT_CM2__REL__CM1',
        'CM1_INT_CM2__REL__CM2'
    ]
    writer.writerow(header)

    for year in reversed(sorted(allYears)):
        pc_c_int_pc_d = pc_c[year].intersection(pc_d[year])
        pc_c_uni_pc_d = pc_c[year].union(pc_d[year])
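
The snippet is truncated here. A hypothetical continuation for the PC columns, derived purely from the header names above (reading "REL" as a ratio of set sizes); the A and CM columns would follow the same pattern:

        # Hypothetical continuation; not part of the original snippet
        row = [
            str(year),
            str(len(pc_c[year])),     # PC1
            str(len(pc_d[year])),     # PC2
            str(len(pc_c_int_pc_d)),  # PC1_INT_PC2
            str(len(pc_c_uni_pc_d)),  # PC1_UNI_PC2
            '%.3f' % (len(pc_c_int_pc_d) / float(len(pc_c_uni_pc_d))),
            '%.3f' % (len(pc_c_int_pc_d) / float(len(pc_c[year]))),
            '%.3f' % (len(pc_c_int_pc_d) / float(len(pc_d[year]))),
        ]
        # ...analogous columns for authors (a_c, a_d) and members (cm_c, cm_d)...
        writer.writerow(row)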
Example #5
def buildSampleSet(inputFile, sampleFile):

    f = open(inputFile, "rb")
    f1 = open(sampleFile, "wb")

    reader = RandomReader(f)
    writer = UnicodeWriter(f1)

    nbRows = 0
    categories = []
    countPages = []
    # First pass: collect the distinct categories found in columns 15-24
    for row in reader:
        nbRows += 1
        for cat in row[15:25]:
            if cat != "?":
                if cat not in categories:
                    categories.append(cat)
                    countPages.append(0)
    data = []

    # Second pass (assumes RandomReader can be iterated a second time):
    # keep the 15 base columns and add one indicator column per category
    for row in reader:
        line = []
        for d in row[0:15]:
            line.append(d)
        for ind, cat in enumerate(categories):
            if cat in row[15:25]:
                countPages[ind] += 1
                line.append('1')
            else:
                line.append('?')
        data.append(line)

    i = 0
    filteredCategories = []
    for cat, cpt in zip(categories, countPages):
        if cpt < 6:  # Drop categories with fewer than 6 subjects
            ind = 15 + i  # column index, accounting for columns already removed
            for d in data:
                d.pop(ind)
        else:
            filteredCategories.append(cat)
            i += 1

    # Three-line header: column names, domain types, and attribute flags
    header = [
        "uid", "cl1", "cl2", "cl3", "cl4", "cl5", "face", "fCols", "nbCols",
        "f1", "f2", "f3", "s", "b", "bestGuess"
    ]
    domain = [
        "c", "d", "d", "d", "d", "d", "d", "c", "c", "c", "c", "c", "c", "c",
        "string"
    ]
    attribut = ["m", "c", "m", "m", "m", "m"]

    for cat in filteredCategories:
        header.append(cat)
        domain.append("d")

    writer.writerow(header)
    writer.writerow(domain)
    writer.writerow(attribut)
    for row in data:
        writer.writerow(row)
    f.close()
    f1.close()
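
RandomReader appears in Examples #3 and #5 but is not defined in either. For the two passes in buildSampleSet to both see data, it has to be re-iterable; here is a minimal sketch under that assumption (only the class name comes from the code, the body is hypothetical):

import csv
import random

class RandomReader:
    """Hypothetical: read a CSV file fully, yield its rows in random order."""

    def __init__(self, f):
        self.rows = list(csv.reader(f))
        random.shuffle(self.rows)

    def __iter__(self):
        # Re-iterable, so buildSampleSet's two passes both work
        return iter(self.rows)

A hypothetical call wiring this to the feature file produced by Example #3 (the sample path is a placeholder):

buildSampleSet("../resources/features3.csv", "../resources/sample.csv")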