class Comparator(object):
    def __init__(self, simdir):
        self.parser = Parser(None)
        # read similar words dictionary
        self.similar_words = json.load(open(simdir))

    def find_similarities(self):
        self.detect_tags()
        i_words = self.input.split(" ")
        similarities = {}

        for iw in i_words:
            if iw in self.similar_words:
                for s in self.similar_words[iw]:
                    if not s[0] in similarities:
                        similarities[s[0]] = s[1]
                    else:
                        similarities[s[0]] += s[1]

        sorted_similarities = sorted(similarities.items(), key=operator.itemgetter(1), reverse=True)

        return sorted_similarities

    def detect_tags(self, input_text):
        tags = {}

        people = {}
        locations = {}
        organizations = {}
        concepts = {}

        print ("Parse Article")
        self.input = self.parser.parse_plain_text(input_text)
        print ("Find Entities in the Article")
        self.entities = self.parser.extract_entities(input_text)
        print "Entities: \n" + str(self.entities)
        for ent in self.entities:
            if self.entities[ent] == "PERSON":
                entity = ent
                people[entity] = 10
            elif self.entities[ent] == "GPE" or self.entities[ent] == "GSP":
                entity = ent
                locations[entity] = 10
            elif self.entities[ent] == "FACILITY" or self.entities[ent] == "ORGANIZATION":
                entity = ent
                organizations[entity] = 10

        print ("Find Frequent Words")
        words = self.calc_freq_words(self.input)
        print ("Find Important Words")
        important_words = self.find_important_words(input_text)
        for word in words:
            if word[1] == 1:
                break  # too infrequent to care

            score = word[1]
            # count frequency
            if word[0] in important_words:
                score *= 2  # double if in the nutgraph of the article
            if word[0] in self.similar_words:

                for s in self.similar_words[word[0]]:

                    if "P" in s[0]:
                        if "O" in s[0]:
                            s[0] = re.sub("/O", "", s[0])
                        if "L" in s[0]:
                            s[0] = re.sub("/L", "", s[0])
                        stripped = re.sub("/P", "", s[0])

                        if stripped in people:
                            people[stripped] += s[1] + score

                        else:
                            people[stripped] = score
                    elif "L" in s[0]:
                        if "O" in s[0]:
                            s[0] = re.sub("/O", "", s[0])
                        stripped = re.sub("/L", "", s[0])

                        if stripped in locations:
                            locations[stripped] += s[1] + score

                        else:
                            locations[stripped] = score
                    elif "O" in s[0]:
                        stripped = re.sub("/O", "", s[0])

                        if stripped in organizations:
                            organizations[stripped] += s[1] + score

                        else:
                            organizations[stripped] = score
                    else:
                        if s[0] in concepts:
                            concepts[s[0]] += s[1] + score
                        else:
                            concepts[s[0]] = score

        print ("Sort Entities and Concepts")
        sorted_people = sorted(people.items(), key=operator.itemgetter(1), reverse=True)
        sorted_orgs = sorted(organizations.items(), key=operator.itemgetter(1), reverse=True)
        sorted_locations = sorted(locations.items(), key=operator.itemgetter(1), reverse=True)
        sorted_concepts = sorted(concepts.items(), key=operator.itemgetter(1), reverse=True)

        tags["people"] = []
        for sp in sorted_people:
            tags["people"].append(sp[0])

        tags["locations"] = []
        for sp in sorted_locations:
            tags["locations"].append(sp[0])

        tags["organizations"] = []
        for sp in sorted_orgs:
            tags["organizations"].append(sp[0])

        tags["concepts"] = []
        for sp in sorted_concepts:
            tags["concepts"].append(sp[0])
            if len(tags["concepts"]) > 500:
                break

        return tags

    def calc_freq_words(self, input_text):
        vocab = {}
        for word in input_text.split(" "):
            if word in vocab:
                vocab[word] += 1
            else:
                vocab[word] = 1

        sorted_freq_words = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)

        return sorted_freq_words

    def find_important_words(self, input_text):
        """Detect entities that appear in the article's nutgraph"""
        words = input_text.split()
        limit = int(len(words) / 4)

        nutgraph = self.parser.parse_plain_text(" ".join(words[0:limit]))
        nutgraph_entities = self.parser.extract_entities(" ".join(words[0:limit]))

        important = []

        for w in nutgraph.split(" "):
            if w in nutgraph_entities:
                if nutgraph_entities[w] == "PERSON":
                    w = w + "P"
                elif nutgraph_entities[w] == "GPE" or nutgraph_entities[w] == "GSP":
                    w = w + "L"
                elif nutgraph_entities[w] == "FACILITY" or nutgraph_entities[w] == "ORGANIZATION":
                    w = w + "O"

            important.append(w)

        return important