Example #1
def parse_dblp_old(name, paper_type):
  f = open(os.path.join(dataPath, 'web2csv', paper_type, '%s.csv'%name), "wb")
  writer = UnicodeWriter(f)
  conf_path = os.path.join(dataPath, 'web', paper_type, name)
  for file_name in reversed(os.listdir(conf_path)):
    file_path = os.path.join(conf_path, file_name)
    print "**** %s ****" % file_path
    tree = ET.parse(file_path)
    root = tree.getroot()
    headers = root.findall('h2')
    lists = root.findall('ul')
    assert len(headers) == len(lists)
    for header, lst in zip(headers, lists):
      topic = header.text.replace("\n", "")
      for item in lst.findall("li"):
        try:
          # year, authors, title, pages, pageCount = format_dblp_xml(base_uri + item.attrib["id"] + ".xml")
          year, authors, title, pages, page_count, doi = format_dblp_xml(get_link_uri(item))
        except urllib2.HTTPError, e:
          if e.code == 429:
            print "SLEEPING FOR 1 min"
            time.sleep(60)
            year, authors, title, pages, page_count, doi = format_dblp_xml(get_link_uri(item))
            # year, authors, title, pages, page_count = format_dblp_xml(base_uri + item.attrib["id"] + ".xml")
          else:
            raise
        writer.writerow([year, authors, title, doi, pages, page_count, topic, ""])
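All of these examples write through a UnicodeWriter helper that is not shown. Python 2's csv module cannot write unicode rows directly, so a wrapper along the lines of the recipe in the Python 2 csv documentation is presumably what the code assumes; a minimal sketch (UTF-8 output):

import codecs
import csv
import cStringIO

class UnicodeWriter:
    """Write rows of unicode strings to a byte stream f as UTF-8 encoded CSV."""
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        self.queue = cStringIO.StringIO()   # staging buffer for one encoded row
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        # csv only handles bytes in Python 2: encode, stage, re-encode, flush
        self.writer.writerow([s.encode("utf-8") for s in row])
        data = self.queue.getvalue().decode("utf-8")
        self.stream.write(self.encoder.encode(data))
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)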
Example #2
def parse_dblp_new(name, volumes, paper_type, base_uri=JOURNAL_BASE_URI, f_name=None):
  if f_name is None:
    f_name = name
  f = open(os.path.join(dataPath, 'web2csv', paper_type, '%s.csv' % f_name), "wb")
  writer = UnicodeWriter(f)
  for volume in volumes:
    uri = "%s/%s/%s%d.html" % (base_uri, name, name, volume)
    print(uri)
    resp = requests.get(uri)
    if resp.status_code != 200:
      print(resp.headers)
      print("Status code: %d for %s%d.\nVerify URI: %s" % (resp.status_code, name, volume, uri))
      return
    soup = Soup(resp.content, "html").html
    current = soup.find("body").find("h2").parent
    topic = get_text(current)
    while True:
      current = current.find_next_sibling()
      if current.name == "ul":
        for li in current.find_all("li", recursive=False):
          xml_uri = "%s/%s.xml" % (JOURNAL_XML_BASE_URI, li.attrs['id'])
          try:
            year, authors, title, pages, page_count, doi = format_dblp_xml(xml_uri)
          except urllib2.HTTPError, e:
            if e.code == 429:
              print "SLEEPING FOR 1 min"
              time.sleep(60)
              year, authors, title, pages, page_count, doi = format_dblp_xml(xml_uri)
            else:
              raise
          writer.writerow([year, authors, title, doi, pages, page_count, topic, ""])
      elif current.name == "header":
        topic = get_text(current)
      if current.name == "div":
        break
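parse_dblp_old and parse_dblp_new above duplicate the same retry-on-HTTP-429 logic inline. A small helper could factor it out; fetch_with_backoff below is a hypothetical name and assumes the same format_dblp_xml callable:

import time
import urllib2

def fetch_with_backoff(xml_uri, wait=60):
  # Hypothetical helper: retry format_dblp_xml once after a rate-limit (429) response.
  try:
    return format_dblp_xml(xml_uri)
  except urllib2.HTTPError, e:
    if e.code != 429:
      raise
    print "SLEEPING FOR %d s" % wait
    time.sleep(wait)
    return format_dblp_xml(xml_uri)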
Example #3
def scatterPlot(outPath, metric1, metric2, k1=None, k2=None):
    f = open(outPath, 'wb')
    writer = UnicodeWriter(f)
    writer.writerow(['year', '%s%d'%(metric1,k1), '%s%d'%(metric2,k2), 'conference'])
    for conferenceName in conferences:
        years1 = set(metrics[conferenceName].getMetric(metric1, k1).keys())
        years2 = set(metrics[conferenceName].getMetric(metric2, k2).keys())
        # Only keep years for which both metrics are available
        for year in reversed(sorted(years1.intersection(years2))):
            x = metrics[conferenceName].getMetric(metric1, k1)[year]
            y = metrics[conferenceName].getMetric(metric2, k2)[year]
            row = [year, x, y, conferenceName]
            rowStr = [str(item) for item in row]
            writer.writerow(rowStr)
    f.close()
Example #4
def parse_csv(conf):
  f = open(os.path.join(dataPath, 'web2csv', 'conferences', '%s_pc.csv'%conf), "wb")
  writer = UnicodeWriter(f)
  folder_path = os.path.join(dataPath, 'web_pc', 'conferences', conf)
  for file_name in reversed(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file_name)
    print "**** %s ****" % file_path
    year = file_name.split("_")[1].split(".")[0]
    with open(file_path, 'r') as csvfile:
      reader = UnicodeReader(csvfile)
      for row in reader:
        writer.writerow([year, "main", row[0].strip()] + [cell.strip() for cell in row[1].strip().split(",")])
  f.close()
Example #5
def tabulate2CSV(outPath, metric, k=None, datatype='float'):
    allYears = set()
    for conferenceName in conferences:
        allYears.update(metrics[conferenceName].getMetric(metric, k).keys())

    f = open(outPath, 'wb')
    writer = UnicodeWriter(f)
    header = ['year'] + [c.upper() for c in conferences]
    headerStr = [str(item) for item in header]
    writer.writerow(headerStr)
    for year in reversed(sorted(allYears)):
        row = [year]
        for conferenceName in conferences:
            try:
                if datatype == 'float':
                    row.append(
                        '%.03f' %
                        metrics[conferenceName].getMetric(metric, k)[year])
                elif datatype == 'int':
                    row.append(metrics[conferenceName].getMetric(metric,
                                                                 k)[year])
            except KeyError:
                # No value recorded for this conference in this year
                row.append('')
        rowStr = [str(item) for item in row]
        writer.writerow(rowStr)
    f.close()
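A hypothetical call, assuming metrics, conferences, and metricsPath are already populated; the metric name 'pcSize' is illustrative only:

tabulate2CSV(os.path.join(metricsPath, 'pc_size.csv'), 'pcSize', datatype='int')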
Example #6
def tabulate2CSV(outPath, metric, k=None, datatype='float'):
    allYears = set()
    for conferenceName in conferences:
        allYears.update(metrics[conferenceName].getMetric(metric,k).keys())

    f = open(outPath, 'wb')
    writer = UnicodeWriter(f)
    header = ['year'] + [c.upper() for c in conferences]
    headerStr = [str(item) for item in header]
    writer.writerow(headerStr)
    for year in reversed(sorted(allYears)):
        row = [year]
        for conferenceName in conferences:
            try:
                if datatype == 'float':
                    row.append('%.03f' % metrics[conferenceName].getMetric(metric,k)[year])
                elif datatype == 'int':
                    row.append(metrics[conferenceName].getMetric(metric,k)[year])
            except KeyError:
                # No value recorded for this conference in this year
                row.append('')
        rowStr = [str(item) for item in row]
        writer.writerow(rowStr)
    f.close()    
Example #7
def scatterPlot(outPath, metric1, metric2, k1=None, k2=None):
    f = open(outPath, 'wb')
    writer = UnicodeWriter(f)
    writer.writerow(
        ['year',
         '%s%d' % (metric1, k1),
         '%s%d' % (metric2, k2), 'conference'])
    for conferenceName in conferences:
        for year in reversed(
                sorted(
                    set(metrics[conferenceName].getMetric(
                        metric1, k1).keys()).intersection(
                            set(metrics[conferenceName].getMetric(
                                metric2, k2).keys())))):
            x = metrics[conferenceName].getMetric(metric1, k1)[year]
            y = metrics[conferenceName].getMetric(metric2, k2)[year]
            row = [year, x, y, conferenceName]
            rowStr = [str(item) for item in row]
            writer.writerow(rowStr)
    f.close()
Example #8
    print c, d

    pc_c = metrics[c].pcPerYear
    pc_d = metrics[d].pcPerYear

    a_c = metrics[c].authorsPerYear
    a_d = metrics[d].authorsPerYear

    cm_c = metrics[c].membersPerYear
    cm_d = metrics[d].membersPerYear

    allYears = set(cm_c.keys()).intersection(cm_d.keys())

    outPath = os.path.join(metricsPath, 'pairwise', '%s_%s.csv' % (c, d))
    f = open(outPath, 'wb')
    writer = UnicodeWriter(f)
    header = [
        'YEAR', 'PC1', 'PC2', 'PC1_INT_PC2', 'PC1_UNI_PC2',
        'PC1_INT_PC2__REL__PC1_UNI_PC2', 'PC1_INT_PC2__REL__PC1',
        'PC1_INT_PC2__REL__PC2', 'A1', 'A2', 'A1_INT_A2', 'A1_UNI_A2',
        'A1_INT_A2__REL__A1_UNI_A2', 'A1_INT_A2__REL__A1',
        'A1_INT_A2__REL__A2', 'CM1', 'CM2', 'CM1_INT_CM2', 'CM1_UNI_CM2',
        'CM1_INT_CM2__REL__CM1_UNI_CM2', 'CM1_INT_CM2__REL__CM1',
        'CM1_INT_CM2__REL__CM2'
    ]
    writer.writerow(header)

    for year in reversed(sorted(allYears)):
        pc_c_int_pc_d = pc_c[year].intersection(pc_d[year])
        pc_c_uni_pc_d = pc_c[year].union(pc_d[year])
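        # Hypothetical continuation (the listing is cut off here): the __REL__
        # columns presumably hold overlap ratios computed from these sets, e.g.
        pc_rel_uni = len(pc_c_int_pc_d) / float(len(pc_c_uni_pc_d)) if pc_c_uni_pc_d else 0.0
        pc_rel_c = len(pc_c_int_pc_d) / float(len(pc_c[year])) if pc_c[year] else 0.0
        pc_rel_d = len(pc_c_int_pc_d) / float(len(pc_d[year])) if pc_d[year] else 0.0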
Example #9
#         pass
    
    return name



# conferences = ['icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce', 'fase', 'icpc', 'fse', 'scam', 'ase']
# conferences = ['msr']
conferences = sys.argv[1:]


unknowns = set()

for conference in conferences:
    g = open(os.path.join(dataPath, 'bht2csv', 'misc', '%s_papers_2013n.csv' % conference), 'wb')
    writer = UnicodeWriter(g)

    f1 = open(os.path.join(dataPath, 'bht2csv', 'misc', '%s_papers_2013.csv' % conference), 'rb')
    reader1 = UnicodeReader(f1)    
    for row in reader1:
        year = row[0]
        authors = ','.join([normaliseName(name) for name in row[1].split(',') if len(normaliseName(name))])
        title = row[2]
        writer.writerow([year, authors, title, '', '', '', ''])
    g.close()

print
soFarSoGood = set()

# "Paulo R. F. Cunha": "Paulo Cunha"
# "Neil Maiden": "Neil A. M. Maiden"
Example #10
def main():

    data = "../resources/SOusers-Mar13.csv"  # File containing SO user dump
    results = "../resources/features3.csv"  # File where features will be stored
    picPath = "../resources/SOpictures/"  # Directory where pictures will be downloaded

    fr = open(os.path.join(data), 'rb')
    fw = open(os.path.join(results), 'ab')

    if _RANDOM:
        reader = RandomReader(fr)
    else:
        reader = UnicodeReader(fr)

    writer = UnicodeWriter(fw)

    queue = Queue()
    if _FACE:
        faceDetector = FaceDetector()

    threads = []
    SOhashes = {}  # Dictionary of user's hashes

    # Use multiple threads to download and get information
    for i in xrange(10):
        threads.append(Downloader(queue))
        threads[-1].start()

    idx = 0
    size = 4500  # Number of subjects

    for row in reader:
        if idx < size:
            so_uid = row[0]
            so_hash = row[2]
            if so_hash not in SOhashes:
                SOhashes[so_hash] = so_uid
                if not isDefaultGravatarPic(so_hash):
                    data = [so_uid]
                    if _VISUAL_FEATURES:

                        # Download picture
                        filepath = os.path.join(picPath, '%d.jpg' % int(so_uid))
                        if not os.path.isfile(filepath):
                            queue.put(
                                ('http://www.gravatar.com/avatar/%s' % so_hash,
                                 filepath))
                            time.sleep(2)

                        # Load picture
                        pic = picUtils.loadPicture(filepath)

                        if _FACE:
                            if faceDetector.isFrontFace(
                                    pic) or faceDetector.isProfileFace(pic):
                                data.append(str(True))
                            else:
                                data.append(str(False))

                        if _MOST_COMMON_COLORS:
                            _, f1, _, f2 = picUtils.mostCommonColor(pic)
                            data.append(str(f1 + f2))

                        if _NBCOLORS:
                            data.append(str(picUtils.getNbOfColors(pic)))

                        if _FARTHEST_NEIGHBOR:
                            F1 = picUtils.farthestNeighborMetric(pic, 10)
                            F2 = picUtils.farthestNeighborMetric(pic, 200)
                            data.append(str(F1))
                            data.append(str(F2))
                            if F1 != 0:
                                data.append(str(F2 / F1))
                            else:
                                data.append('?')

                        if _AVERAGE_SATURATION:
                            data.append(str(picUtils.avgSaturation(pic)))

                        if _THRESHOLD_BRIGHTNESS:
                            data.append(str(picUtils.threBrightness(pic, 0.2)))

                    if _GOOGLE:
                        gi = GoogleImage('http://www.gravatar.com/avatar/%s' %
                                         so_hash)
                        bestGuess = gi.getBestGuess()
                        if bestGuess:
                            bestGuess = bestGuess.encode('utf8')
                            data.append(bestGuess)
                            if _WIKIPEDIA:
                                gs = GoogleSearch("%s site:en.wikipedia.org" %
                                                  bestGuess)
                                wikiTitlePage = gs.getWikipediaTitlePage()
                                if wikiTitlePage:
                                    wiki = Wikipedia(wikiTitlePage)
                                    wiki.categoryGraph(4)
                                    nbCats = 10
                                    i = 0
                                    cats = wiki.sortGraphByDegree()
                                    while i < nbCats and i < len(cats):
                                        data.append(str(cats[i]))
                                        i += 1

                    # Write all information collected in the csv file
                    try:
                        print data
                        writer.writerow(data)
                        idx += 1
                    except:
                        print "Error with data"
        else:
            break
    fr.close()
    fw.close()

    # If here, download finished. Stop threads
    for i in xrange(10):
        queue.put((None, None))
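The isDefaultGravatarPic helper used above is not shown. One common way to detect a default avatar is Gravatar's d=404 option, which makes the service answer HTTP 404 when the hash has no custom image; a hypothetical sketch:

import urllib2

def isDefaultGravatarPic(so_hash):
    # Hypothetical implementation: with d=404, Gravatar answers 404
    # for hashes that have no custom avatar uploaded.
    url = "http://www.gravatar.com/avatar/%s?d=404" % so_hash
    try:
        urllib2.urlopen(url)
        return False
    except urllib2.HTTPError, e:
        if e.code == 404:
            return True
        raise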
Example #11
def buildSampleSet(inputFile, sampleFile):
    
    
    f = open(os.path.join(inputFile), "rb")
    f1 = open(os.path.join(sampleFile), "wb")

    reader = RandomReader(f)
    writer = UnicodeWriter(f1)
    
    nbRows = 0
    categories = []
    countPages = []
    for row in reader:
        nbRows += 1
        for cat in row[15:25]:
            if cat != "?":
                if not cat in categories:
                    categories.append(cat)
                    countPages.append(0)    
    data = []
    
    for row in reader:
        line = []
        for d in row[0:15]:
            line.append(d)
        for ind, cat in enumerate(categories):
            if cat in row[15:25]:
                countPages[ind] += 1
                line.append(str(1))
            else:
                line.append(str('?'))
        data.append(line)
    
    i = 0
    filteredCategories = []
    for cat, cpt in zip(categories, countPages):
        if cpt < 6:  # Keep only categories with at least 6 subjects
            ind = 15 + i
            for d in data:
                d.pop(ind)
        else:
            filteredCategories.append(cat)
            i += 1        
            
    # Header
    header = ["uid", "cl1", "cl2", "cl3", "cl4", "cl5", "face",
              "fCols", "nbCols", "f1", "f2", "f3", "s",
              "b", "bestGuess"]
    domain = ["c", "d", "d", "d", "d", "d", "d", "c", "c", "c", "c", "c", "c", "c", "string"]
    attribut = ["m", "c", "m", "m", "m", "m"]
    
    for cat in filteredCategories:
        header.append(cat)
        domain.append("d")
        
    writer.writerow(header)
    writer.writerow(domain)
    writer.writerow(attribut)
    i = 0
    for row in data:
        writer.writerow(row)
        i += 1
    f.close()
    f1.close()
Example #12
    'icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce', 'fase', 'iwpc', 'sigsoft',
    'scam', 'kbse'
]
conferences = ['msr']

for conference in conferences:

    print conference

    f = open(os.path.join(outPath, 'selection', '%s_papers.csv' % conference),
             'rb')
    reader = UnicodeReader(f)

    g = open(os.path.join(outPath, 'research-papers', '%s.csv' % conference),
             'wb')
    writer = UnicodeWriter(g)

    for row in reader:
        year = row[0]
        authors = []
        for name in row[1].split(','):
            try:
                cleanName = dblpNames[unidecode(name.strip()).strip()]
            except KeyError:
                # Name not in the dblpNames map; fall back to the raw name
                cleanName = unidecode(name.strip()).strip()
            if len(cleanName):
                authors.append(cleanName)
        authors = ','.join(authors)
        writer.writerow([year, authors] + row[2:])

    g.close()
Example #13
from unidecode import unidecode
from nameMap import nameMap

dataPath = os.path.abspath("../../../data")

# conferences = ['icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce', 'fase', 'icpc', 'fse', 'scam', 'ase']
# conferences = ['msr']
conferences = sys.argv[1:]

unknowns = set()

for conference in conferences:
    g = open(
        os.path.join(dataPath, 'bht2csv', 'misc',
                     '%s_papers_2013n.csv' % conference), 'wb')
    writer = UnicodeWriter(g)

    f1 = open(
        os.path.join(dataPath, 'bht2csv', 'misc',
                     '%s_papers_2013.csv' % conference), 'rb')
    reader1 = UnicodeReader(f1)
    for row in reader1:
        year = row[0]
        authors = ','.join([
            normaliseName(name) for name in row[1].split(',')
            if len(normaliseName(name))
        ])
        title = row[2]
        writer.writerow([year, authors, title, '', '', '', ''])
    g.close()
Example #14
def buildSampleSet(inputFile, sampleFile):

    f = open(os.path.join(inputFile), "rb")
    f1 = open(os.path.join(sampleFile), "wb")

    reader = RandomReader(f)
    writer = UnicodeWriter(f1)

    nbRows = 0
    categories = []
    countPages = []
    for row in reader:
        nbRows += 1
        for cat in row[15:25]:
            if cat != "?":
                if not cat in categories:
                    categories.append(cat)
                    countPages.append(0)
    data = []

    for row in reader:
        line = []
        for d in row[0:15]:
            line.append(d)
        for ind, cat in enumerate(categories):
            if cat in row[15:25]:
                countPages[ind] += 1
                line.append(str(1))
            else:
                line.append(str('?'))
        data.append(line)

    i = 0
    filteredCategories = []
    for cat, cpt in zip(categories, countPages):
        if cpt < 6:  # Keep only categories with at least 6 subjects
            ind = 15 + i
            for d in data:
                d.pop(ind)
        else:
            filteredCategories.append(cat)
            i += 1

    # Header
    header = [
        "uid", "cl1", "cl2", "cl3", "cl4", "cl5", "face", "fCols", "nbCols",
        "f1", "f2", "f3", "s", "b", "bestGuess"
    ]
    domain = [
        "c", "d", "d", "d", "d", "d", "d", "c", "c", "c", "c", "c", "c", "c",
        "string"
    ]
    attribut = ["m", "c", "m", "m", "m", "m"]

    for cat in filteredCategories:
        header.append(cat)
        domain.append("d")

    writer.writerow(header)
    writer.writerow(domain)
    writer.writerow(attribut)
    i = 0
    for row in data:
        writer.writerow(row)
        i += 1
    f.close()
    f1.close()
Example #15
    year = int(row[0])
    authorsStr = row[1]
    authors = [cleanName(a.strip(), directLookup, reverseLookup) for a in authorsStr.split(',')]
    authorsSet.update(authors)
f.close()

pcSet = set()

f = open(os.path.abspath(path_PC), "rb")
reader = UnicodeReader(f)
for row in reader:
    year = int(row[0])
    track = row[1]
    if track == 'main':
        name = cleanName(row[2], directLookup, reverseLookup)
        pcSet.add(name)


g = open(os.path.abspath("../../../data/temp/%s-authors.csv" % conferenceName), "wb")
writer = UnicodeWriter(g)
for author in sorted(authorsSet):
    writer.writerow([author, 'author'])
for author in sorted(pcSet):
    writer.writerow([author, 'pc'])
g.close()





Example #16
    print c,d

    pc_c = metrics[c].pcPerYear
    pc_d = metrics[d].pcPerYear
    
    a_c = metrics[c].authorsPerYear
    a_d = metrics[d].authorsPerYear
    
    cm_c = metrics[c].membersPerYear
    cm_d = metrics[d].membersPerYear

    allYears = set(cm_c.keys()).intersection(cm_d.keys())

    outPath = os.path.join(metricsPath, 'pairwise', '%s_%s.csv' % (c,d))
    f = open(outPath, 'wb')
    writer = UnicodeWriter(f)
    header = ['YEAR',
              'PC1',
              'PC2', 
              'PC1_INT_PC2',
              'PC1_UNI_PC2',
              'PC1_INT_PC2__REL__PC1_UNI_PC2',
              'PC1_INT_PC2__REL__PC1',
              'PC1_INT_PC2__REL__PC2',
              'A1',
              'A2', 
              'A1_INT_A2',
              'A1_UNI_A2',
              'A1_INT_A2__REL__A1_UNI_A2',
              'A1_INT_A2__REL__A1',
              'A1_INT_A2__REL__A2',
Example #17
    try:
        aid = reverseLookup[name]
        name = directLookup[aid]
    except KeyError:
        # Name not known to DBLP; keep it unchanged
        pass
    
    return name



conferences = ['icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce', 'fase', 'icpc', 'fse', 'scam', 'ase']


for conference in conferences:
    g = open(os.path.join(dataPath, 'normalised-pc', '%s.csv' % conference), 'wb')
    writer = UnicodeWriter(g)

    f1 = open(os.path.join(dataPath, 'pc', '%s.csv' % conference), 'rb')
    reader1 = UnicodeReader(f1)    
    for row in reader1:
        year = row[0]
        track = row[1]
        if track == 'main':
            pc = ','.join([normaliseName(name) for name in row[2].split(',') if len(normaliseName(name))])
            writer.writerow([year, track, pc])
    g.close()

exit()

#conferences = ['ase']
Example #18
#from unidecode import unidecode
#from nameMap import nameMap
from nameMagic import normaliseName, directLookup, reverseLookup



dataPath = os.path.abspath("../../../data")

#print normaliseName(u'Liz Burd')
#exit()

conferences = ['icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce', 'fase', 'icpc', 'fse', 'scam', 'ase']

for conference in conferences:
    g = open(os.path.join(dataPath, 'normalised-pc', '%s.csv' % conference), 'wb')
    writer = UnicodeWriter(g)

    f1 = open(os.path.join(dataPath, 'pc', '%s.csv' % conference), 'rb')
    reader1 = UnicodeReader(f1)    
    for row in reader1:
        year = row[0]
        track = row[1]
        if track == 'main':
            pc = ','.join([normaliseName(name) for name in row[2].split(',') if len(normaliseName(name))])
            writer.writerow([year, track, pc])
    g.close()

exit()

#conferences = ['ase']
Example #19
def main():
    
    data = "../resources/SOusers-Mar13.csv" # File containing SO user dump
    results = "../resources/features3.csv" # File where features will be stored
    picPath = "../resources/SOpictures/" # Directory where pictures will be downloaded
    
    fr = open(os.path.join(data), 'rb')
    fw = open(os.path.join(results), 'ab')
    
    if _RANDOM:
        reader = RandomReader(fr)
    else:
        reader = UnicodeReader(fr)
        
    
    writer = UnicodeWriter(fw)
    
    queue = Queue()
    if _FACE:
        faceDetector = FaceDetector()
    
    threads = []
    SOhashes = {} # Dictionary of user's hashes
        
    # Use multiple threads to download and get information
    for i in xrange(10):
        threads.append(Downloader(queue))
        threads[-1].start()
        
    
    idx = 0
    size = 4500 # Number of subjects
    
    for row in reader:
        if idx < size:
            so_uid = row[0]            
            so_hash = row[2]
            if so_hash not in SOhashes:
                SOhashes[so_hash] = so_uid
                if not isDefaultGravatarPic(so_hash):
                    data = [so_uid]
                    if _VISUAL_FEATURES:
                          
                        # Download picture
                        filepath = os.path.join(picPath, '%d.jpg' % int(so_uid))
                        if not os.path.isfile(filepath):
                            queue.put(('http://www.gravatar.com/avatar/%s' % so_hash, filepath))
                            time.sleep(2)
                              
                        # Load picture
                        pic = picUtils.loadPicture(filepath)
                      
                        if _FACE:
                            if faceDetector.isFrontFace(pic) or faceDetector.isProfileFace(pic):
                                data.append(str(True))
                            else:
                                data.append(str(False))
                          
                        if _MOST_COMMON_COLORS:
                            _, f1, _, f2 = picUtils.mostCommonColor(pic)
                            data.append(str(f1 + f2))
                              
                        if _NBCOLORS:
                            data.append(str(picUtils.getNbOfColors(pic)))
                              
                        if _FARTHEST_NEIGHBOR:
                            F1 = picUtils.farthestNeighborMetric(pic, 10)
                            F2 = picUtils.farthestNeighborMetric(pic, 200)
                            data.append(str(F1))
                            data.append(str(F2))
                            if F1 != 0:
                                data.append(str(F2/F1))
                            else:
                                data.append('?')
                         
                        if _AVERAGE_SATURATION:
                            data.append(str(picUtils.avgSaturation(pic)))
                          
                        if _THRESHOLD_BRIGHTNESS:
                            data.append(str(picUtils.threBrightness(pic, 0.2)))
                          
                    if _GOOGLE:
                        gi = GoogleImage('http://www.gravatar.com/avatar/%s' % so_hash)
                        bestGuess = gi.getBestGuess()
                        if bestGuess:
                            bestGuess = bestGuess.encode('utf8')
                            data.append(bestGuess)
                            if _WIKIPEDIA:
                                gs = GoogleSearch("%s site:en.wikipedia.org" % bestGuess)
                                wikiTitlePage = gs.getWikipediaTitlePage()
                                if wikiTitlePage:
                                    wiki = Wikipedia(wikiTitlePage)
                                    wiki.categoryGraph(4)
                                    nbCats = 10
                                    i = 0
                                    cats = wiki.sortGraphByDegree()
                                    while i<nbCats and i < len(cats):
                                        data.append(str(cats[i]))
                                        i += 1
                     
                      
                    # Write all information collected in the csv file
                    try:
                        print data
                        writer.writerow(data)
                        idx += 1
                    except:
                        print "Error with data"
        else:
            break
    fr.close()
    fw.close()
    
    # If here, download finished. Stop threads
    for i in xrange(10):
        queue.put((None, None))
Example #20
    return name



conferences = ['icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce', 'fase', 'icpc', 'fse', 'scam', 'ase']
# conferences = ['msr']
# conferences = sys.argv[1:]

for conference in conferences:
    
    print conference
    
    f = open(os.path.join(dataPath, 'bht2csv', '%s_papers.csv' % conference), 'rb')
    reader = UnicodeReader(f)
    
    g = open(os.path.join(dataPath, 'normalised-papers', '%s.csv' % conference), 'wb')
    writer = UnicodeWriter(g)


    for row in reader:
        year = row[0]
        authors = []
        for name in row[1].split(','):
            cleanName = normaliseName(name)
            if len(cleanName):
                authors.append(cleanName)
        authors = ','.join(authors)
        writer.writerow([year, authors] + row[2:])

    g.close()
Example #21
    # First convert unicode chars
    name = unidecode(row[2])
    # Then pass through filter
    try:
        name = nameMap[name]
    except:
        pass
    # Remember name if not in DBLP
    try:
        aid = reverseLookup[name]
    except:
        unknowns.add(name)

# Start name matching between conference PC and DBLP aliases
g = open(os.path.abspath("../../../data/temp/map_%s.csv" % conference), "wb")
writer = UnicodeWriter(g)

soFarSoGood = set()

# "Paulo R. F. Cunha": "Paulo Cunha"
# "Neil Maiden": "Neil A. M. Maiden"
# Strip middle initials, exact match on all other name parts
uselessData = MyDict()
# for each name in the DBLP data
for key in reverseLookup.keys():
    # record a version of the name without initials
    s = " ".join([p.lower() for p in key.split() if len(p) > 1 and p.find('.') == -1])
    uselessData[key] = s

# then for each of the unknowns
for name in sorted(unknowns):
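    # Hypothetical continuation (the listing is cut off here): strip initials
    # from each unknown the same way and look for a unique DBLP match.
    stripped = " ".join([p.lower() for p in name.split()
                         if len(p) > 1 and p.find('.') == -1])
    candidates = [key for key in uselessData.keys() if uselessData[key] == stripped]
    if len(candidates) == 1:
        soFarSoGood.add(name)
        writer.writerow([name, candidates[0]])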