def parse_dblp_old(name, paper_type):
    f = open(os.path.join(dataPath, 'web2csv', paper_type, '%s.csv' % name), "wb")
    writer = UnicodeWriter(f)
    conf_path = os.path.join(dataPath, 'web', paper_type, name)
    for file_name in reversed(os.listdir(conf_path)):
        file_path = os.path.join(conf_path, file_name)
        print "**** %s ****" % file_path
        tree = ET.parse(file_path)
        root = tree.getroot()
        headers = root.findall('h2')
        lists = root.findall('ul')
        # Each <h2> topic header should be paired with exactly one <ul> of papers
        assert len(headers) == len(lists)
        for header, lst in zip(headers, lists):
            topic = header.text.replace("\n", "")
            for item in lst.findall("li"):
                try:
                    # year, authors, title, pages, pageCount = format_dblp_xml(base_uri + item.attrib["id"] + ".xml")
                    year, authors, title, pages, page_count, doi = \
                        format_dblp_xml(get_link_uri(item))
                except urllib2.HTTPError, e:
                    if e.code == 429:
                        # Rate-limited by dblp: back off for a minute, then retry once
                        print "SLEEPING FOR 1 min"
                        time.sleep(60)
                        year, authors, title, pages, page_count, doi = \
                            format_dblp_xml(get_link_uri(item))
                        # year, authors, title, pages, page_count = format_dblp_xml(base_uri + item.attrib["id"] + ".xml")
                    else:
                        raise
                writer.writerow([year, authors, title, doi, pages, page_count, topic, ""])
    f.close()
def parse_dblp_new(name, volumes, paper_type, base_uri=JOURNAL_BASE_URI, f_name=None):
    if f_name is None:
        f_name = name
    f = open(os.path.join(dataPath, 'web2csv', paper_type, '%s.csv' % f_name), "wb")
    writer = UnicodeWriter(f)
    for volume in volumes:
        uri = "%s/%s/%s%d.html" % (base_uri, name, name, volume)
        print(uri)
        resp = requests.get(uri)
        if resp.status_code != 200:
            print(resp.headers)
            print("Status code: %d for %s%d.\nVerify URI: %s"
                  % (resp.status_code, name, volume, uri))
            return
        soup = Soup(resp.content, "html").html
        current = soup.find("body").find("h2").parent
        topic = get_text(current)
        while True:
            current = current.find_next_sibling()
            # Stop at the end of the listing (or if we run out of siblings)
            if current is None or current.name == "div":
                break
            if current.name == "ul":
                for li in current.find_all("li", recursive=False):
                    xml_uri = "%s/%s.xml" % (JOURNAL_XML_BASE_URI, li.attrs['id'])
                    try:
                        year, authors, title, pages, page_count, doi = format_dblp_xml(xml_uri)
                    except urllib2.HTTPError, e:
                        if e.code == 429:
                            # Rate-limited by dblp: back off for a minute, then retry once
                            print "SLEEPING FOR 1 min"
                            time.sleep(60)
                            year, authors, title, pages, page_count, doi = format_dblp_xml(xml_uri)
                        else:
                            raise
                    writer.writerow([year, authors, title, doi, pages, page_count, topic, ""])
            elif current.name == "header":
                topic = get_text(current)
    f.close()
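# A minimal invocation sketch for the journal scraper above. The journal key
# and volume range are hypothetical examples, not values from this codebase;
# 'journals' is assumed to be the paper_type subdirectory under web2csv.
#
#     parse_dblp_new('tse', range(1, 41), 'journals')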
def scatterPlot(outPath, metric1, metric2, k1=None, k2=None):
    f = open(outPath, 'wb')
    writer = UnicodeWriter(f)
    # k may be None (the default); fall back to an empty suffix rather
    # than letting the original '%s%d' formatting raise a TypeError
    k1s = '' if k1 is None else str(k1)
    k2s = '' if k2 is None else str(k2)
    writer.writerow(['year', metric1 + k1s, metric2 + k2s, 'conference'])
    for conferenceName in conferences:
        m1 = metrics[conferenceName].getMetric(metric1, k1)
        m2 = metrics[conferenceName].getMetric(metric2, k2)
        # Only emit years for which both metrics are available
        for year in reversed(sorted(set(m1.keys()).intersection(set(m2.keys())))):
            row = [year, m1[year], m2[year], conferenceName]
            writer.writerow([str(item) for item in row])
    f.close()
def parse_csv(conf):
    f = open(os.path.join(dataPath, 'web2csv', 'conferences', '%s_pc.csv' % conf), "wb")
    writer = UnicodeWriter(f)
    folder_path = os.path.join(dataPath, 'web_pc', 'conferences', conf)
    for file_name in reversed(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        print "**** %s ****" % file_path
        # File names look like <conf>_<year>.csv; extract the year
        year = file_name.split("_")[1].split(".")[0]
        with open(file_path, 'r') as csvfile:
            reader = UnicodeReader(csvfile)
            for row in reader:
                writer.writerow([year, "main", row[0].strip()]
                                + [cell.strip() for cell in row[1].strip().split(",")])
    f.close()
def tabulate2CSV(outPath, metric, k=None, datatype='float'):
    # Collect the union of all years for which any conference has this metric
    allYears = set()
    for conferenceName in conferences:
        allYears.update(metrics[conferenceName].getMetric(metric, k).keys())
    f = open(outPath, 'wb')
    writer = UnicodeWriter(f)
    header = ['year'] + [c.upper() for c in conferences]
    writer.writerow([str(item) for item in header])
    for year in reversed(sorted(allYears)):
        row = [year]
        for conferenceName in conferences:
            try:
                if datatype == 'float':
                    row.append('%.03f' % metrics[conferenceName].getMetric(metric, k)[year])
                elif datatype == 'int':
                    row.append(metrics[conferenceName].getMetric(metric, k)[year])
            except KeyError:
                # This conference has no value for this year; leave the cell empty
                row.append('')
        writer.writerow([str(item) for item in row])
    f.close()
print c, d
pc_c = metrics[c].pcPerYear
pc_d = metrics[d].pcPerYear
a_c = metrics[c].authorsPerYear
a_d = metrics[d].authorsPerYear
cm_c = metrics[c].membersPerYear
cm_d = metrics[d].membersPerYear
allYears = set(cm_c.keys()).intersection(cm_d.keys())
outPath = os.path.join(metricsPath, 'pairwise', '%s_%s.csv' % (c, d))
f = open(outPath, 'wb')
writer = UnicodeWriter(f)
header = [
    'YEAR',
    'PC1', 'PC2', 'PC1_INT_PC2', 'PC1_UNI_PC2',
    'PC1_INT_PC2__REL__PC1_UNI_PC2', 'PC1_INT_PC2__REL__PC1', 'PC1_INT_PC2__REL__PC2',
    'A1', 'A2', 'A1_INT_A2', 'A1_UNI_A2',
    'A1_INT_A2__REL__A1_UNI_A2', 'A1_INT_A2__REL__A1', 'A1_INT_A2__REL__A2',
    'CM1', 'CM2', 'CM1_INT_CM2', 'CM1_UNI_CM2',
    'CM1_INT_CM2__REL__CM1_UNI_CM2', 'CM1_INT_CM2__REL__CM1', 'CM1_INT_CM2__REL__CM2'
]
writer.writerow(header)
for year in reversed(sorted(allYears)):
    pc_c_int_pc_d = pc_c[year].intersection(pc_d[year])
    pc_c_uni_pc_d = pc_c[year].union(pc_d[year])
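# The loop body is truncated in the source right after the union is computed.
# Based on the header names, the remaining PC columns are plausibly filled in
# along these lines (a sketch under that assumption, not the original code):
#
#     def rel(part, whole):
#         # Relative overlap, e.g. PC1_INT_PC2__REL__PC1_UNI_PC2 is the
#         # Jaccard index |PC1 & PC2| / |PC1 | PC2|; guard empty sets
#         return '%.03f' % (float(len(part)) / len(whole)) if len(whole) else ''
#
#     row = [str(year),
#            str(len(pc_c[year])), str(len(pc_d[year])),
#            str(len(pc_c_int_pc_d)), str(len(pc_c_uni_pc_d)),
#            rel(pc_c_int_pc_d, pc_c_uni_pc_d),
#            rel(pc_c_int_pc_d, pc_c[year]),
#            rel(pc_c_int_pc_d, pc_d[year])]
#     # ... analogous columns for authors (A*) and community members (CM*) ...
#     writer.writerow(row)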
def main():
    data = "../resources/SOusers-Mar13.csv"  # File containing SO user dump
    results = "../resources/features3.csv"   # File where features will be stored
    picPath = "../resources/SOpictures/"     # Directory where pictures will be downloaded

    fr = open(os.path.join(data), 'rb')
    fw = open(os.path.join(results), 'ab')
    if _RANDOM:
        reader = RandomReader(fr)
    else:
        reader = UnicodeReader(fr)
    writer = UnicodeWriter(fw)

    queue = Queue()
    if _FACE:
        faceDetector = FaceDetector()
    threads = []
    SOhashes = {}  # Dictionary of user's hashes

    # Use multiple threads to download and get information
    for i in xrange(10):
        threads.append(Downloader(queue))
        threads[-1].start()

    idx = 0
    size = 4500  # Number of subjects
    for row in reader:
        if idx < size:
            so_uid = row[0]
            so_hash = row[2]
            if so_hash not in SOhashes:
                SOhashes[so_hash] = so_uid
                if not isDefaultGravatarPic(so_hash):
                    # Features collected for this user (renamed from 'data',
                    # which shadowed the input path above)
                    features = [so_uid]
                    if _VISUAL_FEATURES:
                        # Download picture
                        filepath = os.path.join(picPath, '%d.jpg' % int(so_uid))
                        if not os.path.isfile(filepath):
                            queue.put(('http://www.gravatar.com/avatar/%s' % so_hash, filepath))
                            time.sleep(2)
                        # Load picture
                        pic = picUtils.loadPicture(filepath)
                        if _FACE:
                            if faceDetector.isFrontFace(pic) or faceDetector.isProfileFace(pic):
                                features.append(str(True))
                            else:
                                features.append(str(False))
                        if _MOST_COMMON_COLORS:
                            _, f1, _, f2 = picUtils.mostCommonColor(pic)
                            features.append(str(f1 + f2))
                        if _NBCOLORS:
                            features.append(str(picUtils.getNbOfColors(pic)))
                        if _FARTHEST_NEIGHBOR:
                            F1 = picUtils.farthestNeighborMetric(pic, 10)
                            F2 = picUtils.farthestNeighborMetric(pic, 200)
                            features.append(str(F1))
                            features.append(str(F2))
                            if F1 != 0:
                                features.append(str(F2 / F1))
                            else:
                                features.append('?')
                        if _AVERAGE_SATURATION:
                            features.append(str(picUtils.avgSaturation(pic)))
                        if _THRESHOLD_BRIGHTNESS:
                            features.append(str(picUtils.threBrightness(pic, 0.2)))
                    if _GOOGLE:
                        gi = GoogleImage('http://www.gravatar.com/avatar/%s' % so_hash)
                        bestGuess = gi.getBestGuess()
                        if bestGuess:
                            bestGuess = bestGuess.encode('utf8')
                            features.append(bestGuess)
                            if _WIKIPEDIA:
                                gs = GoogleSearch("%s site:en.wikipedia.org" % bestGuess)
                                wikiTitlePage = gs.getWikipediaTitlePage()
                                if wikiTitlePage:
                                    wiki = Wikipedia(wikiTitlePage)
                                    wiki.categoryGraph(4)
                                    nbCats = 10
                                    i = 0
                                    cats = wiki.sortGraphByDegree()
                                    while i < nbCats and i < len(cats):
                                        features.append(str(cats[i]))
                                        i += 1
                    # Write all information collected in the csv file
                    try:
                        print features
                        writer.writerow(features)
                        idx += 1
                    except:
                        print "Error with data"
        else:
            break
    fr.close()
    fw.close()
    # If here, download finished. Stop threads
    for i in xrange(10):
        queue.put((None, None))
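# The Downloader class is not shown in this excerpt. From the way main() uses
# it, each worker consumes (url, filepath) tuples from the shared queue and
# exits on a (None, None) sentinel. A minimal sketch of that contract follows;
# the class body is an assumption, only the queue protocol is taken from main().

import threading
import urllib2


class Downloader(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            url, filepath = self.queue.get()
            if url is None:
                break  # sentinel from main(): stop this worker
            try:
                content = urllib2.urlopen(url).read()
                with open(filepath, 'wb') as out:
                    out.write(content)
            except urllib2.URLError:
                pass  # skip pictures that fail to download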
def buildSampleSet(inputFile, sampleFile):
    f = open(os.path.join(inputFile), "rb")
    f1 = open(os.path.join(sampleFile), "wb")
    reader = RandomReader(f)  # assumed to support being iterated twice
    writer = UnicodeWriter(f1)

    # First pass: collect the set of categories seen in columns 15-24
    nbRows = 0
    categories = []
    countPages = []
    for row in reader:
        nbRows += 1
        for cat in row[15:25]:
            if cat != "?":
                if cat not in categories:
                    categories.append(cat)
                    countPages.append(0)

    # Second pass: binarise category membership into one column per category
    data = []
    for row in reader:
        line = []
        for d in row[0:15]:
            line.append(d)
        for ind, cat in enumerate(categories):
            if cat in row[15:25]:
                countPages[ind] += 1
                line.append('1')
            else:
                line.append('?')
        data.append(line)

    # Drop categories with fewer than 6 subjects; i tracks the column offset
    # of the current category among the columns kept so far, so pop() indices
    # stay correct as columns are removed
    i = 0
    filteredCategories = []
    for cat, cpt in zip(categories, countPages):
        if cpt < 6:
            ind = 15 + i
            for d in data:
                d.pop(ind)
        else:
            filteredCategories.append(cat)
            i += 1

    # Three-row header: attribute names, domains (c=continuous, d=discrete),
    # and attribute flags
    header = ["uid", "cl1", "cl2", "cl3", "cl4", "cl5", "face", "fCols",
              "nbCols", "f1", "f2", "f3", "s", "b", "bestGuess"]
    domain = ["c", "d", "d", "d", "d", "d", "d", "c", "c", "c", "c", "c",
              "c", "c", "string"]
    attribut = ["m", "c", "m", "m", "m", "m"]
    for cat in filteredCategories:
        header.append(cat)
        domain.append("d")
    writer.writerow(header)
    writer.writerow(domain)
    writer.writerow(attribut)

    for row in data:
        writer.writerow(row)

    f.close()
    f1.close()
conferences = [
    'icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce', 'fase', 'iwpc',
    'sigsoft', 'scam', 'kbse'
]
conferences = ['msr']  # override: process only MSR

for conference in conferences:
    print conference
    f = open(os.path.join(outPath, 'selection', '%s_papers.csv' % conference), 'rb')
    reader = UnicodeReader(f)
    g = open(os.path.join(outPath, 'research-papers', '%s.csv' % conference), 'wb')
    writer = UnicodeWriter(g)
    for row in reader:
        year = row[0]
        authors = []
        for name in row[1].split(','):
            # Map to the canonical DBLP spelling when known,
            # otherwise fall back to the transliterated name
            try:
                cleanName = dblpNames[unidecode(name.strip()).strip()]
            except KeyError:
                cleanName = unidecode(name.strip()).strip()
            if len(cleanName):
                authors.append(cleanName)
        authors = ','.join(authors)
        writer.writerow([year, authors] + row[2:])
    f.close()
    g.close()
import os
import sys

from unidecode import unidecode
from nameMap import nameMap

# UnicodeReader/UnicodeWriter and normaliseName are helpers defined
# elsewhere in this project

dataPath = os.path.abspath("../../../data")

# conferences = ['icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce', 'fase', 'icpc', 'fse', 'scam', 'ase']
# conferences = ['msr']
conferences = sys.argv[1:]
unknowns = set()

for conference in conferences:
    g = open(os.path.join(dataPath, 'bht2csv', 'misc',
                          '%s_papers_2013n.csv' % conference), 'wb')
    writer = UnicodeWriter(g)
    f1 = open(os.path.join(dataPath, 'bht2csv', 'misc',
                           '%s_papers_2013.csv' % conference), 'rb')
    reader1 = UnicodeReader(f1)
    for row in reader1:
        year = row[0]
        authors = ','.join([normaliseName(name) for name in row[1].split(',')
                            if len(normaliseName(name))])
        title = row[2]
        writer.writerow([year, authors, title, '', '', '', ''])
    f1.close()
    g.close()
for row in reader:
    year = int(row[0])
    authorsStr = row[1]
    authors = [cleanName(a.strip(), directLookup, reverseLookup)
               for a in authorsStr.split(',')]
    authorsSet.update(authors)
f.close()

pcSet = set()
f = open(os.path.abspath(path_PC), "rb")
reader = UnicodeReader(f)
for row in reader:
    year = int(row[0])
    track = row[1]
    if track == 'main':
        name = cleanName(row[2], directLookup, reverseLookup)
        pcSet.add(name)
f.close()

g = open(os.path.abspath("../../../data/temp/%s-authors.csv" % conferenceName), "wb")
writer = UnicodeWriter(g)
for author in sorted(authorsSet):
    writer.writerow([author, 'author'])
for author in sorted(pcSet):
    writer.writerow([author, 'pc'])
g.close()
    # Tail of normaliseName: map a name to its canonical DBLP spelling
    # via the reverse alias lookup, leaving it unchanged when unknown
    try:
        aid = reverseLookup[name]
        name = directLookup[aid]
    except KeyError:
        pass
    return name
# from unidecode import unidecode
# from nameMap import nameMap
from nameMagic import normaliseName, directLookup, reverseLookup

dataPath = os.path.abspath("../../../data")

# print normaliseName(u'Liz Burd')
# exit()

conferences = ['icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce', 'fase',
               'icpc', 'fse', 'scam', 'ase']

for conference in conferences:
    g = open(os.path.join(dataPath, 'normalised-pc', '%s.csv' % conference), 'wb')
    writer = UnicodeWriter(g)
    f1 = open(os.path.join(dataPath, 'pc', '%s.csv' % conference), 'rb')
    reader1 = UnicodeReader(f1)
    for row in reader1:
        year = row[0]
        track = row[1]
        if track == 'main':
            pc = ','.join([normaliseName(name) for name in row[2].split(',')
                           if len(normaliseName(name))])
            writer.writerow([year, track, pc])
    g.close()

exit()
# conferences = ['ase']
    return name


conferences = ['icse', 'icsm', 'wcre', 'csmr', 'msr', 'gpce', 'fase',
               'icpc', 'fse', 'scam', 'ase']
# conferences = ['msr']
# conferences = sys.argv[1:]

for conference in conferences:
    print conference
    f = open(os.path.join(dataPath, 'bht2csv', '%s_papers.csv' % conference), 'rb')
    reader = UnicodeReader(f)
    g = open(os.path.join(dataPath, 'normalised-papers', '%s.csv' % conference), 'wb')
    writer = UnicodeWriter(g)
    for row in reader:
        year = row[0]
        authors = []
        for name in row[1].split(','):
            cleanName = normaliseName(name)
            if len(cleanName):
                authors.append(cleanName)
        authors = ','.join(authors)
        writer.writerow([year, authors] + row[2:])
    f.close()
    g.close()
        # First convert unicode chars
        name = unidecode(row[2])
        # Then pass through filter
        try:
            name = nameMap[name]
        except KeyError:
            pass
        # Remember name if not in DBLP (lookup is a membership probe only)
        try:
            aid = reverseLookup[name]
        except KeyError:
            unknowns.add(name)

# Start name matching between conference PC and DBLP aliases
g = open(os.path.abspath("../../../data/temp/map_%s.csv" % conference), "wb")
writer = UnicodeWriter(g)
soFarSoGood = set()

# "Paulo R. F. Cunha": "Paulo Cunha"
# "Neil Maiden": "Neil A. M. Maiden"
# Strip middle initials, exact match on all other name parts
uselessData = MyDict()
# for each name in the DBLP data
for key in reverseLookup.keys():
    # record a version of the name without initials
    s = " ".join([p.lower() for p in key.split()
                  if len(p) > 1 and p.find('.') == -1])
    uselessData[key] = s
# then for each of the unknowns
for name in sorted(unknowns):
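# The excerpt cuts off at the start of the matching loop. A sketch of how the
# initial-stripping match described in the comments above could proceed; the
# body below is an assumption based on uselessData and soFarSoGood, not the
# original code:
#
#     for name in sorted(unknowns):
#         stripped = " ".join([p.lower() for p in name.split()
#                              if len(p) > 1 and p.find('.') == -1])
#         matches = [key for key, s in uselessData.items() if s == stripped]
#         if len(matches) == 1:
#             # unambiguous match: record the alias pair
#             writer.writerow([name, matches[0]])
#             soFarSoGood.add(name)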