def publication_years():
    # Histogram of publication years over the whole corpus.
    plt.figure(num=None, figsize=(8, 4), dpi=80, facecolor='w', edgecolor='k')
    publicationYears = list(simpleDoc.publicationDatetime().year
                            for simpleDoc in SimpleDoc.getallBetween(None, None))
    histDisc(plt, publicationYears, width=0.5)
    # plt.savefig(figurePath("publication_years.png"))
    plt.tight_layout()
    plt.show()
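# `histDisc` is not defined in this excerpt. A minimal sketch of what it is
# assumed to do, i.e. draw a bar chart with one bar per distinct integer
# value; the name and signature follow the call above, the body is a guess.
def histDisc(plot, values, width=0.5):
    from collections import Counter
    counts = Counter(values)                    # value -> frequency
    xs = sorted(counts.keys())
    plot.bar([x - width / 2.0 for x in xs],     # center each bar on its value
             [counts[x] for x in xs], width=width)
    plot.xticks(xs)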
def tweetsBySpecificUserCorrelations():
    # Correlate, per paper, the number of tweets by a single user (ATP_CME)
    # with the paper's average citation count.
    docs = SimpleDoc.getallBetween((2012, 6), (2012, 8))
    pairs = []
    for doc in docs:
        numTweets = len(filter(lambda tweet: tweet.username == "ATP_CME", doc.tweets))
        citations = doc.averageCitations()
        pairs.append([numTweets, citations])
    x, y = zip(*pairs)
    print allCorrelations(x, y)
    plt.scatter(x, y)
    plt.show()
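# `allCorrelations` is not shown either. It presumably bundles the usual
# correlation coefficients for a pair of samples; a sketch under that
# assumption (each scipy call returns a (coefficient, p-value) pair):
def allCorrelations(x, y):
    from scipy import stats
    return {
        "pearson": stats.pearsonr(x, y),
        "spearman": stats.spearmanr(x, y),
        "kendall": stats.kendalltau(x, y),
    }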
(("pmcViews", ()), (2011, 6), "PMC views"), (("maxCitations", ()), (2009, 3), "Citations") ] attributeNames = map(lambda x: x[0][0], attributeList) attributePrintNames = map(lambda x: x[2], attributeList) calls = map(lambda x: x[0], attributeList) stats = [] for ind, attr in zip(range(0, len(attributeList)), attributeList): call = attr[0] lowerBound = attr[1] attName = attr[0][0] valuesForMetric = filter(lambda x: x != None, map(lambda doc: applyCall(doc, call), SimpleDoc.getallBetween(lowerBound, None) )) minV, maxV, meanV, std = min(valuesForMetric), max(valuesForMetric), np.mean(valuesForMetric), np.std(valuesForMetric) stats.append((attName, call, meanV, std, len(valuesForMetric))) print attName + "\t" + "\t".join(map(lambda x: str(x), [minV, maxV, meanV, std])) statValues = [] for stat in stats: name = stat[0] call = stat[1] mean = stat[2] std = stat[3] numValues = stat[4]
# expertCategories = ['Medicine', 'Health']
wordExperts = getWordExperts(expertWords)
# patrickExperts = getPatrickExperts(expertCategories)

# Restriction to 'Biological Sciences' papers (disabled):
"""bioDocs = minimizedDocs(
    filter(
        lambda doc: doc.mendeleyDisciplines != None
                    and 'Biological Sciences' in doc.mendeleyDisciplines,
        SimpleDoc.getallBetween((2012, 6), (2012, 8))
    ),
    metrics
)"""
docs = minimizedDocs(SimpleDoc.getallBetween((2012, 6), (2012, 8)), metrics)

usersInTimewindow = set(usr for doc in docs for usr in doc[0])
totalNumTweets = sum(1 for doc in docs for u in doc[0])

# Baseline computation for increasing tweet counts (disabled):
"""f = open("baselines", "w")
for numTweets in range(100, totalNumTweets, 100):
    print str(numTweets) + " / " + str(totalNumTweets)
    baseline = getBaseline(docs, metricNames, numTweets)
    f.write(json.dumps({"num-tweets": numTweets, "baseline": baseline}) + "\n")
    f.flush()
f.close()"""
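# Neither `minimizedDocs` nor `metrics` is defined in this excerpt. Judging
# from the usage above and the inline equivalent in the next fragment
# (doc[0] = tweeting usernames, doc[1] = metric scores), `minimizedDocs`
# likely reduces each SimpleDoc to those two lists; a sketch:
def minimizedDocs(documents, metrics):
    return [[[tweet.username for tweet in doc.tweets],
             [metricFun(doc) for _, metricFun in metrics]]
            for doc in documents]

# `metrics` is presumably a list of (name, scoring function) pairs, given
# the metric[1](doc) calls below; a hypothetical example built from
# attributes that appear elsewhere in this code:
metrics = [
    ("citations", lambda doc: doc.averageCitations()),
    ("pmcViews", lambda doc: doc.pmcViews()),
]
metricNames = map(lambda m: m[0], metrics)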
import json
import math
import itertools

from main.util.db import openDb
from scipy import stats
from main.util.common import SimpleDoc, powerset, Log

expertTopics = map(lambda s: s.strip(), open("data/expert_topics", "r"))
l = Log(filename="foo", verbose=True)

# Reduce each document to its tweeting usernames and its metric scores.
docs = map(lambda doc: [map(lambda tweet: tweet.username, doc.tweets),
                        map(lambda metric: metric[1](doc), metrics)],
           SimpleDoc.getallBetween((2012, 6), (2012, 8)))

# Baseline: Spearman correlation between a document's total number of
# tweets and each metric.
baseline = {}
for ind, metricName in enumerate(metricNames):
    pairs = []
    for doc in docs:
        numTweets = len(doc[0])
        metricScore = doc[1][ind]
        pairs.append([numTweets, metricScore])
    x, y = zip(*pairs)
    s, p = stats.spearmanr(x, y)
    baseline[metricName] = s

count = 0
count2 = 0
for ind, metricName in enumerate(metricNames):
    pairs = []
import random

def canBeEncoded(text):
    # Only keep strings that survive a plain str() conversion, so they can
    # be written to an ASCII output file.
    try:
        str(text)
        return True
    except UnicodeEncodeError:
        return False

def tweetsBetweenDay(documents, lowerBound, upperBound):
    # All tweets posted between lowerBound and upperBound days after the
    # corresponding paper's publication.
    return [[tweet.text, tweet.timestamp, tweet.username,
             doc.doi, doc.title, doc.publicationTimestamp]
            for doc in documents
            for tweet in doc.tweets
            if (lowerBound * 60 * 60 * 24) <= (tweet.timestamp - doc.publicationTimestamp) <= (upperBound * 60 * 60 * 24)
               and canBeEncoded(tweet.text) and canBeEncoded(doc.title)]

relevantDocuments = SimpleDoc.getallBetween((2012, 6), (2012, 8))

# Stratified sample over the age of the paper at tweet time. Note that
# random.sample raises ValueError if a stratum holds fewer tweets than
# the requested sample size.
tweets = []
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 0, 1), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 1, 3), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 3, 5), 111))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 7, 30), 333))
tweets.extend(random.sample(tweetsBetweenDay(relevantDocuments, 100, 300), 333))

# One tab-separated line per tweet, shuffled; write the first 333 lines.
tweetTexts = map(lambda tweetdata: "\t".join([str(tweetdata[0]), str(tweetdata[1]),
                                              tweetdata[2], tweetdata[3],
                                              tweetdata[4], str(tweetdata[5])]),
                 tweets)
random.shuffle(tweetTexts)

f = open("tweetTexts_1.txt", "w")
for text in tweetTexts[0:333]:
    f.write(text.replace("\n", " ").replace("\"", "").replace("'", "") + "\n")
f.close()
# Tail of a disabled (triple-quoted) block whose opening lies above this
# fragment; it duplicates the per-attribute summary loop from earlier.
valuesForMetric = filter(lambda x: x != None,
                         map(lambda doc: applyCall(doc, call),
                             SimpleDoc.getallBetween(lowerBound, None)))
minV, maxV, meanV, std = (min(valuesForMetric), max(valuesForMetric),
                          np.mean(valuesForMetric), np.std(valuesForMetric))
print attName + "\t" + "\t".join(map(lambda x: str(x), [minV, maxV, meanV, std]))
"""

cat = "Biological Sciences"
# Restriction to one Mendeley discipline (disabled):
"""consideredDocs = filter(
    lambda doc: doc.mendeleyDisciplines != None and cat in doc.mendeleyDisciplines,
    SimpleDoc.getallBetween((2012, 6), (2012, 8))
)"""
consideredDocs = SimpleDoc.getallBetween((2012, 6), (2012, 8))
print len(consideredDocs)

matrix = getAttributeValueMatrix(consideredDocs, calls)
corrs = correlationBetweenEverything(matrix, attributeNames)

# Dumping the pairwise correlations to disk (disabled):
"""f = open("foo", "w")
for corr in corrs:
    f.write(corr.toJson() + "\n")
f.close()"""
# corrs = CorrelationItem.fromFile("stuff/pairwise_corr_2012-6_2012-8.json")

f = open("foo", "w")
m = []
for a1 in attributeNames:
    row = []
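# `getAttributeValueMatrix` and `correlationBetweenEverything` are not shown
# in this excerpt. A sketch of what they plausibly do (the real version
# apparently returns CorrelationItem objects with a toJson() method; plain
# tuples are used here for brevity):
def getAttributeValueMatrix(documents, calls):
    # One row per document, one column per attribute.
    return [[applyCall(doc, call) for call in calls] for doc in documents]

def correlationBetweenEverything(matrix, names):
    from scipy import stats as scipystats
    columns = zip(*matrix)  # one tuple of values per attribute
    results = []
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            # Keep only documents where both attributes are present.
            pairs = [(a, b) for a, b in zip(columns[i], columns[j])
                     if a is not None and b is not None]
            x, y = zip(*pairs)
            s, p = scipystats.spearmanr(x, y)
            results.append((names[i], names[j], s, p))
    return results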