import itertools
import json
import math
import random

import numpy as np
from scipy import stats

from main.util.db import openDb
from main.util.common import (SimpleDoc, powerset, Log, rankCorrelation,
                              pearsonCorrelation, allCorrelations)
from main.metrics.get_experts import (getPatrickExperts, metrics, metricNames,
                                      getBaseline, minimizedDocs, getWordExperts,
                                      correlationWrtUsers, corrComparision,
                                      remainNTweets, corrDiffs, getListScore)

db = openDb("stuff/localconnect.json")

l = Log(filename="foo", verbose=True)

topCategories = ['Biology', 'Medicine', 'Sports', 'Culture', 'Technology', 'Education', 'Health', 
    'Business', 'Belief', 'Humanities', 'Society', 'Life', 'Arts', 'Language', 'Law', 
    'History', 'Geography', 'Agriculture', 'Politics', 'Mathematics', 'Science', 
    'Nature', 'Environment', 'People', 'Chronology' ]


outcomes = {}

"""for cats in topCategories:
    l.log("analyze categories: " + repr(cats))

    cur = db.cursor()

    scores = { }
    for ind, metricName in zip(range(0, len(metricNames)), metricNames):
        pairs = []
        for doc in docs:
            numExpertTweets = sum((1 for usr in doc[0] if not usr in patrickExperts))
            metricScore = doc[1][ind]

with open("data/expert_topics", "r") as f:
    expertTopics = [line.strip() for line in f]


# Each document is reduced to a pair: the usernames behind its tweets and its
# scores under every metric (materialized as lists so they can be re-iterated).
docs = [
    [[tweet.username for tweet in doc.tweets],
     [metric[1](doc) for metric in metrics]]
    for doc in SimpleDoc.getallBetween((2012, 6), (2012, 8))
]
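# Shape sketch (illustrative values, not taken from the data):
#   docs[i][0] -> ["user_a", "user_b", ...]   # one username per tweet
#   docs[i][1] -> [12, 3, 0, ...]             # one score per metric, ordered as in `metrics`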

baseline = {}

# Baseline: for each metric, the Spearman rank correlation between a document's
# raw tweet count and its metric score.
for ind, metricName in enumerate(metricNames):
    pairs = []
    for doc in docs:
        numTweets = len(doc[0])
        metricScore = doc[1][ind]
        pairs.append([numTweets, metricScore])

    x, y = zip(*pairs)
    s, p = stats.spearmanr(x, y)

    baseline[metricName] = s
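# A quick way to inspect the baseline, sorted by correlation strength
# (the formatting here is a sketch, not taken from the original script):
for name, rho in sorted(baseline.items(), key=lambda kv: kv[1], reverse=True):
    l.log("baseline spearman  %-22s %.3f" % (name, rho))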

# Unfinished draft of a second per-metric pass:
# count = 0
# count2 = 0
# for ind, metricName in enumerate(metricNames):
#     pairs = []



mendeleyDisciplines = [
    'Linguistics', 'Economics', 'Psychology', 'Humanities', 'Materials Science', 
    'Earth Sciences', 'Environmental Sciences', 'Biological Sciences', 'Medicine', 
    'Mathematics', 'Chemistry', 'Physics', 'Social Sciences', 'Electrical and Electronic Engineering', 
    'Astronomy / Astrophysics / Space Science', 'Sports and Recreation', 
    'Management Science / Operations Research', 'Philosophy', 'Law', 
    'Business Administration', 'Engineering', 'Design', 'Arts and Literature', 
    'Education', 'Computer and Information Science'
]

expertWords = ["university", "ph.d", "ph. d", "ph d", "phd", "professor",
               "doctor", "dr.", "institute", "postdoc"]
# Further candidate words, currently excluded:
# "post doc", "student", "research", "prof", "post grad", "science",
# "scientist", "department", "study", "studies", "develop"
# expertLists = list(map(lambda s: s.strip(), open("data/expert_topics", "r")))
# expertCategories = ['Medicine', 'Health' ]

# getWordExperts presumably returns the users whose profile text matches one
# of expertWords; its exact contract is defined in get_experts.
wordExperts = getWordExperts(expertWords)
# patrickExperts = getPatrickExperts(expertCategories)
    ("Crossref", lambda doc: doc.numCrossrefs()),
    ("PubMed", lambda doc: doc.pubmedCitations),
    ("Scopus", lambda doc: doc.scopusCitations),
    ("Max Citations", lambda doc: doc.maxCitations()),
    ("PLOS pdf", lambda doc: doc.pdfViews),
    ("PLOS HTML", lambda doc: doc.htmlViews),
    ("PMC pdf", lambda doc: doc.pmcPdf),
    ("PMC HTML", lambda doc: doc.pmcHtml),
    ("Facebook Shares", lambda doc: doc.facebookShares),
    ("Facebook Comments", lambda doc: doc.facebookComments),
    ("Facebook Likes", lambda doc: doc.facebookLikes),
    ("Mendeley Readers", lambda doc: doc.mendeleyReaders),
    ("CiteULike", lambda doc: doc.citeULikeShares),
]
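# Illustrative use of one extractor (names as defined above):
#   name, extract = metrics[0]   # ("Crossref", <lambda>)
#   score = extract(doc)         # equivalent to doc.numCrossrefs()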


metricNames = [metric[0] for metric in metrics]  # shadows the imported metricNames

negativeUsernames = [entry[0] for entry in json.load(open("user_exclude_list_negative"))]

# Tally how many tweets each user contributed across all documents.
usersTweetFrequence = {}
for doc in docs:
    for user in doc[0]:
        usersTweetFrequence[user] = usersTweetFrequence.get(user, 0) + 1
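# Equivalent tally using collections.Counter, if preferred:
#   from collections import Counter
#   usersTweetFrequence = Counter(user for doc in docs for user in doc[0])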

# "Low-tweet" users: those below a cutoff. The statement was truncated in the
# source; the completion and the threshold value below are assumptions.
LOW_TWEET_CUTOFF = 2  # assumed value
lowTweetUsers = set(
    user for user, freq in usersTweetFrequence.items() if freq < LOW_TWEET_CUTOFF
)